diff --git a/docs/docs/append-table/data-evolution.md b/docs/docs/append-table/data-evolution.md index fe71e7841581..f8a93dbd31a2 100644 --- a/docs/docs/append-table/data-evolution.md +++ b/docs/docs/append-table/data-evolution.md @@ -149,6 +149,112 @@ Note that: * `_ROW_ID` is only available via the `$row_tracking` system table. * Self-merge only supports `WHEN MATCHED THEN UPDATE` semantics. +## Streaming Upsert + +Data Evolution also supports streaming upsert for Append tables. By specifying `data-evolution.upsert-keys`, Paimon +classifies incoming records as INSERT or UPDATE based on a business key lookup, and writes them accordingly. + +### Basic Usage + +```sql +CREATE TABLE T (id INT, name STRING, `value` DOUBLE) WITH ( + 'row-tracking.enabled' = 'true', + 'data-evolution.enabled' = 'true', + 'bucket' = '-1' +); + +-- Initial batch load +INSERT INTO T VALUES (1, 'a', 1.0), (2, 'b', 2.0), (3, 'c', 3.0); + +-- Streaming upsert: update existing rows and insert new ones +INSERT INTO T /*+ OPTIONS('data-evolution.upsert-keys'='id') */ +VALUES (1, 'a_new', 10.0), (4, 'd', 4.0); + +SELECT * FROM T ORDER BY id; ++----+-------+-------+ +| id | name | value | ++----+-------+-------+ +| 1 | a_new | 10.0 | +| 2 | b | 2.0 | +| 3 | c | 3.0 | +| 4 | d | 4.0 | +``` + +### Partial Column Update + +Streaming upsert supports **partial column updates based on NULL detection**. When an update row has NULL in certain +columns, those NULLs are treated as "don't change" rather than actual NULL values. Only the non-NULL columns are +written to a new partial file, and the original files are kept for merge-on-read. + +This is especially useful when you only need to update a few columns without touching the rest. Simply specify +only the columns you want to update in the `INSERT INTO` column list — unspecified columns are treated as "don't change": + +```sql +-- Update only 'name' column; 'value' is not specified, meaning "don't change" +INSERT INTO T (id, name) /*+ OPTIONS('data-evolution.upsert-keys'='id') */ +VALUES (1, 'a_updated'), (3, 'c_updated'); + +SELECT * FROM T ORDER BY id; ++----+-----------+-------+ +| id | name | value | ++----+-----------+-------+ +| 1 | a_updated | 1.0 | +| 2 | b | 2.0 | +| 3 | c_updated | 3.0 | +``` + +In this example, only `id` and `name` are written to the new file. The `value` column data is read from the original +files during merge-on-read, avoiding unnecessary I/O. + +You can also perform successive partial updates on different column sets: + +```sql +-- First: update only 'name' +INSERT INTO T (id, name) /*+ OPTIONS('data-evolution.upsert-keys'='id') */ +VALUES (1, 'name_v2'); + +-- Second: update only 'value' +INSERT INTO T (id, `value`) /*+ OPTIONS('data-evolution.upsert-keys'='id') */ +VALUES (1, 100.0); + +SELECT * FROM T WHERE id = 1; ++----+---------+-------+ +| id | name | value | ++----+---------+-------+ +| 1 | name_v2 | 100.0 | +``` + +Note that: +* NULL in an update row means "don't change this column". You cannot explicitly set a column to NULL via this mechanism. +* If all columns are non-NULL, the update performs a full rewrite of the file (original files are replaced). +* Partial column updates and new inserts can be mixed in the same upsert batch. + +### Index Parallelism + +Streaming upsert internally maintains a business-key-to-row-id index for classifying records as INSERT or UPDATE. +By default, each partition's index is loaded on exactly one subtask (`data-evolution.upsert-index-parallelism = 1`), +which minimizes memory usage since the index is not duplicated. However, this means all records for a given partition +are processed by a single subtask, which can become a throughput bottleneck for large partitions. + +You can increase `data-evolution.upsert-index-parallelism` to spread the load across more subtasks per partition. +Each partition can then be distributed to up to `indexParallelism` subtasks, improving throughput at the cost of +loading the index multiple times: + +```sql +INSERT INTO T /*+ OPTIONS( + 'data-evolution.upsert-keys'='id', + 'data-evolution.upsert-index-parallelism'='4' +) */ VALUES ...; +``` + +| indexParallelism | Behavior | +|:---|:---| +| 1 (default) | Each partition maps to exactly one subtask. Index loaded once per partition, lowest memory usage. | +| N | Each partition can be distributed to up to N subtasks. Higher throughput, but the index is loaded N times, increasing memory usage. | + +Choosing the right value depends on your workload: use the default for partitions with moderate data volume, +and increase it when a single subtask cannot keep up with the incoming record rate. + ## File Group Spec Through the RowId metadata, files are organized into a file group. diff --git a/docs/generated/catalog_configuration.html b/docs/generated/catalog_configuration.html index 2a5f25ef4370..74053392d1ca 100644 --- a/docs/generated/catalog_configuration.html +++ b/docs/generated/catalog_configuration.html @@ -1,3 +1,21 @@ + diff --git a/docs/generated/cdc_configuration.html b/docs/generated/cdc_configuration.html index 989440f124a7..a6ff6ba4754a 100644 --- a/docs/generated/cdc_configuration.html +++ b/docs/generated/cdc_configuration.html @@ -1,3 +1,21 @@ +
diff --git a/docs/generated/core_configuration.html b/docs/generated/core_configuration.html index 5d608dba9ed1..018cccc10a7b 100644 --- a/docs/generated/core_configuration.html +++ b/docs/generated/core_configuration.html @@ -1,3 +1,21 @@ +
@@ -44,6 +62,12 @@ + + + + + + @@ -434,18 +458,24 @@ - - - - - - + + + + + + + + + + + + diff --git a/docs/generated/flink_catalog_configuration.html b/docs/generated/flink_catalog_configuration.html index 91c565e347f9..6241c224eaf8 100644 --- a/docs/generated/flink_catalog_configuration.html +++ b/docs/generated/flink_catalog_configuration.html @@ -1,3 +1,21 @@ +
Boolean Write blob field using blob descriptor rather than blob bytes.
blob-compaction.enabled
falseBooleanWhether to compact blob files when compacting a data evolution table.
blob-descriptor-field
(none)Duration The TTL in rocksdb index for cross partition upsert (primary keys not contain all partition fields), this can avoid maintaining too many indexes and lead to worse and worse performance, but please note that this may also cause data duplication.
blob-compaction.enabled
falseBooleanWhether to compact blob files when compacting a data evolution table.
data-evolution.enabled
false Boolean Whether enable data evolution for row tracking table.
data-evolution.upsert-index-parallelism
1IntegerThe number of subtasks each partition can be distributed to in the upsert classify phase. Higher values increase throughput but also increase index memory usage.
data-evolution.upsert-keys
(none)StringComma-separated list of column names used as business key for streaming upsert in data evolution mode. When set, incoming records are deduplicated by this key: existing rows are updated via partial write, new rows are appended.
data-file.external-paths
(none)
diff --git a/docs/generated/flink_connector_configuration.html b/docs/generated/flink_connector_configuration.html index 347473e479bb..686349e7bdfc 100644 --- a/docs/generated/flink_connector_configuration.html +++ b/docs/generated/flink_connector_configuration.html @@ -1,3 +1,21 @@ +
diff --git a/docs/generated/hive_catalog_configuration.html b/docs/generated/hive_catalog_configuration.html index 66e39f6a4ff2..48adc1114fb3 100644 --- a/docs/generated/hive_catalog_configuration.html +++ b/docs/generated/hive_catalog_configuration.html @@ -1,3 +1,21 @@ +
diff --git a/docs/generated/hive_connector_configuration.html b/docs/generated/hive_connector_configuration.html index a3aac1d1b16e..3e0d7fb2282f 100644 --- a/docs/generated/hive_connector_configuration.html +++ b/docs/generated/hive_connector_configuration.html @@ -1,3 +1,21 @@ +
diff --git a/docs/generated/iceberg_configuration.html b/docs/generated/iceberg_configuration.html index e0e6d9126f01..c08388c8cbc4 100644 --- a/docs/generated/iceberg_configuration.html +++ b/docs/generated/iceberg_configuration.html @@ -1,3 +1,21 @@ +
diff --git a/docs/generated/jdbc_catalog_configuration.html b/docs/generated/jdbc_catalog_configuration.html index 9ec21bc1c465..9b36f3f37257 100644 --- a/docs/generated/jdbc_catalog_configuration.html +++ b/docs/generated/jdbc_catalog_configuration.html @@ -1,3 +1,21 @@ +
diff --git a/docs/generated/orc_configuration.html b/docs/generated/orc_configuration.html index 62cd8ddd8754..c5d5d7d42075 100644 --- a/docs/generated/orc_configuration.html +++ b/docs/generated/orc_configuration.html @@ -1,3 +1,21 @@ +
diff --git a/docs/generated/rocksdb_configuration.html b/docs/generated/rocksdb_configuration.html index 3ba853bf4382..d24f2f379f27 100644 --- a/docs/generated/rocksdb_configuration.html +++ b/docs/generated/rocksdb_configuration.html @@ -1,3 +1,21 @@ +
diff --git a/docs/generated/spark_catalog_configuration.html b/docs/generated/spark_catalog_configuration.html index d8b0674b06b8..f09bfed60e91 100644 --- a/docs/generated/spark_catalog_configuration.html +++ b/docs/generated/spark_catalog_configuration.html @@ -1,3 +1,21 @@ +
diff --git a/docs/generated/spark_connector_configuration.html b/docs/generated/spark_connector_configuration.html index c4ed50b8a2f5..a6c8278f4c44 100644 --- a/docs/generated/spark_connector_configuration.html +++ b/docs/generated/spark_connector_configuration.html @@ -1,3 +1,21 @@ +
diff --git a/paimon-api/src/main/java/org/apache/paimon/CoreOptions.java b/paimon-api/src/main/java/org/apache/paimon/CoreOptions.java index 9de20d19cdb5..2c0326b54e13 100644 --- a/paimon-api/src/main/java/org/apache/paimon/CoreOptions.java +++ b/paimon-api/src/main/java/org/apache/paimon/CoreOptions.java @@ -2225,6 +2225,25 @@ public InlineElement getDescription() { .defaultValue(false) .withDescription("Whether enable data evolution for row tracking table."); + public static final ConfigOption DATA_EVOLUTION_UPSERT_KEYS = + key("data-evolution.upsert-keys") + .stringType() + .noDefaultValue() + .withDescription( + "Comma-separated list of column names used as business key for " + + "streaming upsert in data evolution mode. When set, incoming " + + "records are deduplicated by this key: existing rows are " + + "updated via partial write, new rows are appended."); + + public static final ConfigOption DATA_EVOLUTION_UPSERT_INDEX_PARALLELISM = + key("data-evolution.upsert-index-parallelism") + .intType() + .defaultValue(1) + .withDescription( + "The number of subtasks each partition can be distributed to " + + "in the upsert classify phase. Higher values increase " + + "throughput but also increase index memory usage."); + public static final ConfigOption BLOB_COMPACTION_ENABLED = key("blob-compaction.enabled") .booleanType() @@ -3698,6 +3717,18 @@ public boolean dataEvolutionEnabled() { return options.get(DATA_EVOLUTION_ENABLED); } + public List dataEvolutionUpsertKeyColumns() { + String keys = options.get(DATA_EVOLUTION_UPSERT_KEYS); + if (keys == null || keys.trim().isEmpty()) { + return Collections.emptyList(); + } + return Arrays.asList(keys.split(",")); + } + + public int dataEvolutionUpsertIndexParallelism() { + return options.get(DATA_EVOLUTION_UPSERT_INDEX_PARALLELISM); + } + public boolean blobCompactionEnabled() { return options.get(BLOB_COMPACTION_ENABLED); } diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/UpsertClassifyOperator.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/UpsertClassifyOperator.java new file mode 100644 index 000000000000..0e5c0ba4adfe --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/UpsertClassifyOperator.java @@ -0,0 +1,338 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.flink.dataevolution; + +import org.apache.paimon.CoreOptions; +import org.apache.paimon.data.BinaryRow; +import org.apache.paimon.data.GenericRow; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.data.serializer.InternalRowSerializer; +import org.apache.paimon.data.serializer.RowCompactedSerializer; +import org.apache.paimon.disk.IOManager; +import org.apache.paimon.disk.IOManagerImpl; +import org.apache.paimon.table.FileStoreTable; +import org.apache.paimon.types.DataField; +import org.apache.paimon.types.RowKind; +import org.apache.paimon.types.RowType; +import org.apache.paimon.utils.Preconditions; +import org.apache.paimon.utils.SnapshotManager; + +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.AbstractStreamOperatorFactory; +import org.apache.flink.streaming.api.operators.BoundedOneInput; +import org.apache.flink.streaming.api.operators.ChainingStrategy; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; +import org.apache.flink.streaming.api.operators.StreamOperator; +import org.apache.flink.streaming.api.operators.StreamOperatorParameters; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; + +/** + * Phase 1 operator for data evolution streaming upsert. Maintains a {@link UpsertKeyIndex} per + * partition, classifies each record as UPDATE or INSERT, and emits tagged {@link UpsertRecord}s + * downstream for the Phase 2 write operator. + */ +public class UpsertClassifyOperator extends AbstractStreamOperator + implements OneInputStreamOperator, BoundedOneInput { + + private static final Logger LOG = LoggerFactory.getLogger(UpsertClassifyOperator.class); + + private final FileStoreTable table; + private final List upsertKeyColumns; + + private transient RowType upsertKeyType; + private transient RowCompactedSerializer keySerializer; + private transient InternalRow.FieldGetter[] keyFieldGetters; + private transient InternalRow.FieldGetter[] partitionFieldGetters; + private transient IOManager paimonIOManager; + private transient InternalRowSerializer rowSerializer; + private transient InternalRowSerializer partitionSerializer; + + private transient Map partitionIndices; + private transient Map> buffer; + private transient long lastSyncedSnapshotId; + private transient SnapshotManager snapshotManager; + private transient File indexBaseDir; + + public UpsertClassifyOperator( + StreamOperatorParameters parameters, + FileStoreTable table, + List upsertKeyColumns) { + setup(parameters.getContainingTask(), parameters.getStreamConfig(), parameters.getOutput()); + this.table = table; + this.upsertKeyColumns = upsertKeyColumns; + } + + @Override + public void open() throws Exception { + super.open(); + + RowType tableRowType = table.rowType(); + + List keyFields = new ArrayList<>(); + int[] keyFieldIndices = new int[upsertKeyColumns.size()]; + for (int i = 0; i < upsertKeyColumns.size(); i++) { + int idx = tableRowType.getFieldIndex(upsertKeyColumns.get(i)); + Preconditions.checkArgument( + idx >= 0, "Upsert key column not found: " + upsertKeyColumns.get(i)); + keyFieldIndices[i] = idx; + keyFields.add(tableRowType.getFields().get(idx)); + } + this.upsertKeyType = new RowType(keyFields); + this.keySerializer = new RowCompactedSerializer(upsertKeyType); + this.keyFieldGetters = new InternalRow.FieldGetter[upsertKeyColumns.size()]; + for (int i = 0; i < upsertKeyColumns.size(); i++) { + keyFieldGetters[i] = + InternalRow.createFieldGetter( + tableRowType.getTypeAt(keyFieldIndices[i]), keyFieldIndices[i]); + } + + List partitionKeys = table.partitionKeys(); + this.partitionFieldGetters = new InternalRow.FieldGetter[partitionKeys.size()]; + for (int i = 0; i < partitionKeys.size(); i++) { + int idx = tableRowType.getFieldIndex(partitionKeys.get(i)); + partitionFieldGetters[i] = + InternalRow.createFieldGetter(tableRowType.getTypeAt(idx), idx); + } + if (partitionKeys.isEmpty()) { + this.partitionSerializer = null; + } else { + RowType partRowType = tableRowType.project(partitionKeys.toArray(new String[0])); + this.partitionSerializer = new InternalRowSerializer(partRowType); + } + + this.rowSerializer = new InternalRowSerializer(tableRowType); + this.partitionIndices = new HashMap<>(); + this.buffer = new HashMap<>(); + + this.snapshotManager = table.store().snapshotManager(); + Long latestId = snapshotManager.latestSnapshotId(); + this.lastSyncedSnapshotId = latestId != null ? latestId : -1; + + this.indexBaseDir = + new File( + getContainingTask() + .getEnvironment() + .getIOManager() + .getSpillingDirectories()[0], + "data-evolution-upsert-classify-" + UUID.randomUUID()); + if (!indexBaseDir.mkdirs()) { + throw new IOException("Failed to create index directory: " + indexBaseDir); + } + + this.paimonIOManager = + new IOManagerImpl( + getContainingTask() + .getEnvironment() + .getIOManager() + .getSpillingDirectories()[0] + .getPath()); + } + + @Override + public void processElement(StreamRecord element) throws Exception { + InternalRow row = element.getValue(); + RowKind kind = row.getRowKind(); + Preconditions.checkArgument( + kind == RowKind.INSERT || kind == RowKind.UPDATE_AFTER, + "Data evolution upsert only supports +I and +U, but got: %s", + kind); + BinaryRow partition = extractPartition(row); + BytesKey keyBytes = extractKeyBytes(row); + InternalRow copiedRow = rowSerializer.copy(row); + + buffer.computeIfAbsent(partition, k -> new LinkedHashMap<>()).put(keyBytes, copiedRow); + } + + @Override + public void prepareSnapshotPreBarrier(long checkpointId) throws Exception { + flushBuffer(); + } + + @Override + public void endInput() throws Exception { + flushBuffer(); + } + + private void flushBuffer() throws Exception { + incrementalSyncIndex(); + + for (Map.Entry> partitionEntry : + buffer.entrySet()) { + BinaryRow partition = partitionEntry.getKey(); + LinkedHashMap keyRows = partitionEntry.getValue(); + UpsertKeyIndex index = getOrBootstrap(partition); + + for (Map.Entry entry : keyRows.entrySet()) { + byte[] keyBytes = entry.getKey().bytes; + InternalRow row = entry.getValue(); + + Long rowId = index.lookupRowId(keyBytes); + if (rowId != null) { + long firstRowId = index.lookupFirstRowId(rowId); + long offset = rowId - firstRowId; + output.collect( + new StreamRecord<>( + new UpsertRecord(partition, firstRowId, offset, row))); + } else { + output.collect(new StreamRecord<>(new UpsertRecord(partition, -1, -1, row))); + } + } + } + buffer.clear(); + } + + @Override + public void close() throws Exception { + super.close(); + for (UpsertKeyIndex index : partitionIndices.values()) { + index.close(); + } + partitionIndices.clear(); + if (paimonIOManager != null) { + paimonIOManager.close(); + } + } + + private void incrementalSyncIndex() throws Exception { + Long latestId = snapshotManager.latestSnapshotIdFromFileSystem(); + if (latestId == null || latestId <= lastSyncedSnapshotId) { + return; + } + + for (Map.Entry entry : partitionIndices.entrySet()) { + entry.getValue().incrementalSync(table, entry.getKey(), lastSyncedSnapshotId, latestId); + } + + lastSyncedSnapshotId = latestId; + } + + private BinaryRow extractPartition(InternalRow row) { + if (partitionFieldGetters.length == 0) { + return BinaryRow.EMPTY_ROW; + } + GenericRow partRow = new GenericRow(partitionFieldGetters.length); + for (int i = 0; i < partitionFieldGetters.length; i++) { + partRow.setField(i, partitionFieldGetters[i].getFieldOrNull(row)); + } + return partitionSerializer.toBinaryRow(partRow).copy(); + } + + private BytesKey extractKeyBytes(InternalRow row) { + GenericRow keyRow = new GenericRow(keyFieldGetters.length); + for (int i = 0; i < keyFieldGetters.length; i++) { + keyRow.setField(i, keyFieldGetters[i].getFieldOrNull(row)); + } + return new BytesKey(keySerializer.serializeToBytes(keyRow)); + } + + private UpsertKeyIndex getOrBootstrap(BinaryRow partition) throws Exception { + UpsertKeyIndex index = partitionIndices.get(partition); + if (index != null) { + return index; + } + + File partDir = new File(indexBaseDir, "part-" + UUID.randomUUID()); + if (!partDir.mkdirs()) { + throw new IOException("Failed to create partition index dir: " + partDir); + } + index = new UpsertKeyIndex(partDir, upsertKeyType); + + if (lastSyncedSnapshotId > 0) { + CoreOptions coreOptions = table.coreOptions(); + index.bootstrap( + table, + partition, + lastSyncedSnapshotId, + paimonIOManager, + coreOptions.writeBufferSize() / 2, + coreOptions.pageSize(), + coreOptions.localSortMaxNumFileHandles(), + coreOptions.spillCompressOptions()); + LOG.info("Bootstrapped upsert key index for partition {}", partition); + } + + partitionIndices.put(partition, index); + return index; + } + + /** Wrapper for byte[] with proper equals/hashCode. */ + static final class BytesKey { + final byte[] bytes; + + BytesKey(byte[] bytes) { + this.bytes = bytes; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof BytesKey)) { + return false; + } + return Arrays.equals(bytes, ((BytesKey) o).bytes); + } + + @Override + public int hashCode() { + return Arrays.hashCode(bytes); + } + } + + /** Factory for creating {@link UpsertClassifyOperator}. */ + public static class Factory extends AbstractStreamOperatorFactory + implements OneInputStreamOperatorFactory { + + private final FileStoreTable table; + private final List upsertKeyColumns; + + public Factory(FileStoreTable table, List upsertKeyColumns) { + this.table = table; + this.upsertKeyColumns = upsertKeyColumns; + this.chainingStrategy = ChainingStrategy.ALWAYS; + } + + @Override + @SuppressWarnings("unchecked") + public > T createStreamOperator( + StreamOperatorParameters parameters) { + return (T) new UpsertClassifyOperator(parameters, table, upsertKeyColumns); + } + + @SuppressWarnings("rawtypes") + @Override + public Class getStreamOperatorClass(ClassLoader classLoader) { + return UpsertClassifyOperator.class; + } + } +} diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/UpsertKeyIndex.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/UpsertKeyIndex.java new file mode 100644 index 000000000000..13c508e78c9f --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/UpsertKeyIndex.java @@ -0,0 +1,361 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.flink.dataevolution; + +import org.apache.paimon.compression.CompressOptions; +import org.apache.paimon.data.BinaryRow; +import org.apache.paimon.data.GenericRow; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.data.InternalRow.FieldGetter; +import org.apache.paimon.data.serializer.RowCompactedSerializer; +import org.apache.paimon.disk.IOManager; +import org.apache.paimon.io.DataFileMeta; +import org.apache.paimon.lookup.sort.db.SimpleLsmKvDb; +import org.apache.paimon.manifest.ManifestEntry; +import org.apache.paimon.options.MemorySize; +import org.apache.paimon.reader.RecordReader; +import org.apache.paimon.sort.BinaryExternalSortBuffer; +import org.apache.paimon.table.FileStoreTable; +import org.apache.paimon.table.SpecialFields; +import org.apache.paimon.table.source.DataSplit; +import org.apache.paimon.table.source.InnerTableRead; +import org.apache.paimon.types.DataField; +import org.apache.paimon.types.RowType; +import org.apache.paimon.types.VarBinaryType; +import org.apache.paimon.utils.MutableObjectIterator; +import org.apache.paimon.utils.VarLengthIntUtils; + +import javax.annotation.Nullable; + +import java.io.Closeable; +import java.io.File; +import java.io.IOException; +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.TreeSet; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static org.apache.paimon.format.blob.BlobFileFormat.isBlobFile; +import static org.apache.paimon.types.VectorType.isVectorStoreFile; + +/** + * Manages a business-key-to-{@code _ROW_ID} index for data evolution streaming upsert. Uses {@link + * SimpleLsmKvDb} for the underlying storage, following the same pattern as {@code + * ClusteringKeyIndex}. + */ +public class UpsertKeyIndex implements Closeable { + + private final SimpleLsmKvDb kvDb; + private final RowType upsertKeyType; + private final RowCompactedSerializer keySerializer; + private final FieldGetter[] keyFieldGetters; + private final Map> firstIdToFiles; + + private FirstRowIdLookup firstRowIdLookup; + private InnerTableRead cachedTableRead; + + public UpsertKeyIndex(File dbDir, RowType upsertKeyType) { + this.upsertKeyType = upsertKeyType; + this.keySerializer = new RowCompactedSerializer(upsertKeyType); + this.kvDb = + SimpleLsmKvDb.builder(dbDir) + .keyComparator(keySerializer.createSliceComparator()) + .build(); + this.firstIdToFiles = new HashMap<>(); + + int keyFieldCount = upsertKeyType.getFieldCount(); + this.keyFieldGetters = new FieldGetter[keyFieldCount]; + for (int i = 0; i < keyFieldCount; i++) { + keyFieldGetters[i] = InternalRow.createFieldGetter(upsertKeyType.getTypeAt(i), i); + } + } + + /** + * Bootstrap the index from existing data files for a given partition. Scans all files, reads + * (upsert_key, _ROW_ID) pairs, sorts externally, and bulk-loads into the KvDb. + */ + public void bootstrap( + FileStoreTable table, + BinaryRow partition, + long snapshotId, + IOManager ioManager, + long sortBufferSize, + int pageSize, + int maxNumFileHandles, + CompressOptions compression) + throws Exception { + List entries = scanPartitionFiles(table, partition, snapshotId); + buildFileMetadata(entries); + + if (entries.isEmpty()) { + return; + } + + int keyFieldCount = upsertKeyType.getFieldCount(); + + List combinedFields = new ArrayList<>(); + for (int i = 0; i < keyFieldCount; i++) { + DataField kf = upsertKeyType.getFields().get(i); + combinedFields.add(new DataField(i, kf.name(), kf.type())); + } + combinedFields.add( + new DataField(keyFieldCount, "_value_bytes", new VarBinaryType(Integer.MAX_VALUE))); + RowType combinedType = new RowType(combinedFields); + + int[] sortFields = IntStream.range(0, keyFieldCount).toArray(); + BinaryExternalSortBuffer sortBuffer = + BinaryExternalSortBuffer.create( + ioManager, + combinedType, + sortFields, + sortBufferSize, + pageSize, + maxNumFileHandles, + compression, + MemorySize.MAX_VALUE); + + try { + InnerTableRead tableRead = getOrCreateTableRead(table); + + for (Map.Entry> fileGroup : firstIdToFiles.entrySet()) { + long firstRowId = fileGroup.getKey(); + List files = fileGroup.getValue(); + DataSplit split = + DataSplit.builder() + .withPartition(partition) + .withBucket(0) + .withDataFiles(files) + .withBucketPath( + table.store() + .pathFactory() + .bucketPath(partition, 0) + .toString()) + .rawConvertible(false) + .build(); + + long offset = 0; + try (RecordReader reader = tableRead.createReader(split)) { + RecordReader.RecordIterator batch; + while ((batch = reader.readBatch()) != null) { + InternalRow row; + while ((row = batch.next()) != null) { + long rowId = firstRowId + offset; + byte[] value = encodeRowId(rowId); + + GenericRow combinedRow = new GenericRow(combinedType.getFieldCount()); + for (int i = 0; i < keyFieldCount; i++) { + combinedRow.setField(i, keyFieldGetters[i].getFieldOrNull(row)); + } + combinedRow.setField(keyFieldCount, value); + sortBuffer.write(combinedRow); + offset++; + } + batch.releaseBatch(); + } + } + } + + MutableObjectIterator sortedIterator = sortBuffer.sortedIterator(); + BinaryRow binaryRow = new BinaryRow(combinedType.getFieldCount()); + FieldGetter valueGetter = + InternalRow.createFieldGetter( + new VarBinaryType(Integer.MAX_VALUE), keyFieldCount); + + Iterator> entryIterator = + new Iterator>() { + private BinaryRow current = binaryRow; + private boolean hasNext; + + { + advance(); + } + + private void advance() { + try { + current = sortedIterator.next(current); + hasNext = current != null; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Override + public boolean hasNext() { + return hasNext; + } + + @Override + public Map.Entry next() { + byte[] key = keySerializer.serializeToBytes(current); + byte[] value = (byte[]) valueGetter.getFieldOrNull(current); + advance(); + return new AbstractMap.SimpleImmutableEntry<>(key, value); + } + }; + + kvDb.bulkLoad(entryIterator); + } finally { + sortBuffer.clear(); + } + } + + @Nullable + public Long lookupRowId(byte[] keyBytes) throws IOException { + byte[] value = kvDb.get(keyBytes); + if (value == null) { + return null; + } + return VarLengthIntUtils.decodeLong(value, 0); + } + + public long lookupFirstRowId(long rowId) { + return firstRowIdLookup.lookup(rowId); + } + + /** + * Incrementally sync the index from new snapshots. Scans the latest snapshot for newly added + * files (those whose firstRowId is not yet tracked in {@link #firstIdToFiles}), reads their + * (key, _ROW_ID) pairs, and inserts them into the kvDb. + */ + public void incrementalSync( + FileStoreTable table, BinaryRow partition, long oldSnapshotId, long newSnapshotId) + throws Exception { + List entries = scanPartitionFiles(table, partition, newSnapshotId); + + Map> newFirstIdToFiles = new HashMap<>(); + for (ManifestEntry entry : entries) { + long firstRowId = entry.file().nonNullFirstRowId(); + newFirstIdToFiles.computeIfAbsent(firstRowId, k -> new ArrayList<>()).add(entry.file()); + } + + int keyFieldCount = upsertKeyType.getFieldCount(); + InnerTableRead tableRead = getOrCreateTableRead(table); + + for (Map.Entry> fileGroup : newFirstIdToFiles.entrySet()) { + long firstRowId = fileGroup.getKey(); + if (firstIdToFiles.containsKey(firstRowId)) { + continue; + } + + List files = fileGroup.getValue(); + DataSplit split = + DataSplit.builder() + .withPartition(partition) + .withBucket(0) + .withDataFiles(files) + .withBucketPath( + table.store().pathFactory().bucketPath(partition, 0).toString()) + .rawConvertible(false) + .build(); + + long offset = 0; + try (RecordReader reader = tableRead.createReader(split)) { + RecordReader.RecordIterator batch; + while ((batch = reader.readBatch()) != null) { + InternalRow row; + while ((row = batch.next()) != null) { + long rowId = firstRowId + offset; + GenericRow keyRow = new GenericRow(keyFieldCount); + for (int i = 0; i < keyFieldCount; i++) { + keyRow.setField(i, keyFieldGetters[i].getFieldOrNull(row)); + } + byte[] key = keySerializer.serializeToBytes(keyRow); + kvDb.put(key, encodeRowId(rowId)); + offset++; + } + batch.releaseBatch(); + } + } + } + + buildFileMetadata(entries); + } + + @Override + public void close() throws IOException { + kvDb.close(); + cachedTableRead = null; + } + + private InnerTableRead getOrCreateTableRead(FileStoreTable table) { + if (cachedTableRead == null) { + List upsertKeyNames = + upsertKeyType.getFields().stream() + .map(DataField::name) + .collect(Collectors.toList()); + RowType readType = buildReadType(table.rowType(), upsertKeyNames); + cachedTableRead = table.newRead().withReadType(readType); + } + return cachedTableRead; + } + + private void buildFileMetadata(List entries) { + TreeSet rowIdSet = new TreeSet<>(); + firstIdToFiles.clear(); + for (ManifestEntry entry : entries) { + DataFileMeta fileMeta = entry.file(); + long firstRowId = fileMeta.nonNullFirstRowId(); + rowIdSet.add(firstRowId); + firstIdToFiles.computeIfAbsent(firstRowId, k -> new ArrayList<>()).add(fileMeta); + } + this.firstRowIdLookup = new FirstRowIdLookup(new ArrayList<>(rowIdSet)); + } + + private static RowType buildReadType(RowType tableRowType, List upsertKeyNames) { + RowType withRowId = SpecialFields.rowTypeWithRowId(tableRowType); + List projection = new ArrayList<>(upsertKeyNames); + projection.add(SpecialFields.ROW_ID.name()); + return withRowId.project(projection); + } + + private static byte[] encodeRowId(long rowId) { + byte[] buf = new byte[10]; + int len = VarLengthIntUtils.encodeLong(buf, rowId); + byte[] result = new byte[len]; + System.arraycopy(buf, 0, result, 0, len); + return result; + } + + private static List scanPartitionFiles( + FileStoreTable table, BinaryRow partition, long snapshotId) { + List allEntries = + table.store() + .newScan() + .withManifestEntryFilter( + entry -> + entry.file().firstRowId() != null + && !isBlobFile(entry.file().fileName()) + && !isVectorStoreFile(entry.file().fileName())) + .withSnapshot(snapshotId) + .plan() + .files(); + List result = new ArrayList<>(); + for (ManifestEntry entry : allEntries) { + if (entry.partition().equals(partition)) { + result.add(entry); + } + } + return result; + } +} diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/UpsertRecord.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/UpsertRecord.java new file mode 100644 index 000000000000..862d5dbdc3b8 --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/UpsertRecord.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.flink.dataevolution; + +import org.apache.paimon.data.BinaryRow; +import org.apache.paimon.data.InternalRow; + +/** + * A tagged record carrying classification metadata through the network shuffle between {@link + * UpsertClassifyOperator} (Phase 1) and {@link UpsertWriteOperator} (Phase 2). + */ +public class UpsertRecord { + + private final BinaryRow partition; + private final long firstRowId; + private final long offset; + private final InternalRow row; + + public UpsertRecord(BinaryRow partition, long firstRowId, long offset, InternalRow row) { + this.partition = partition; + this.firstRowId = firstRowId; + this.offset = offset; + this.row = row; + } + + public BinaryRow partition() { + return partition; + } + + public long firstRowId() { + return firstRowId; + } + + public long offset() { + return offset; + } + + public InternalRow row() { + return row; + } + + public boolean isInsert() { + return firstRowId < 0; + } +} diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/UpsertRecordChannelComputer.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/UpsertRecordChannelComputer.java new file mode 100644 index 000000000000..5c99d9807e76 --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/UpsertRecordChannelComputer.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.flink.dataevolution; + +import org.apache.paimon.table.sink.ChannelComputer; +import org.apache.paimon.utils.MurmurHashUtils; + +/** + * A {@link ChannelComputer} that routes {@link UpsertRecord}s for the Phase 2 shuffle. UPDATE + * records are routed by {@code firstRowId} to ensure single-writer-per-file. INSERT records are + * round-robin distributed to avoid data skew. + */ +public class UpsertRecordChannelComputer implements ChannelComputer { + + private static final long serialVersionUID = 1L; + + private transient int numChannels; + private transient int insertCounter; + + @Override + public void setup(int numChannels) { + this.numChannels = numChannels; + this.insertCounter = 0; + } + + @Override + public int channel(UpsertRecord record) { + if (record.isInsert()) { + return (insertCounter++ & Integer.MAX_VALUE) % numChannels; + } + long hash = MurmurHashUtils.fmix(record.firstRowId()); + return (int) (hash % numChannels + numChannels) % numChannels; + } + + @Override + public String toString() { + return "shuffle by firstRowId/partition for upsert"; + } +} diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/UpsertRecordSerializer.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/UpsertRecordSerializer.java new file mode 100644 index 000000000000..70763c2ef64f --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/UpsertRecordSerializer.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.flink.dataevolution; + +import org.apache.paimon.data.BinaryRow; +import org.apache.paimon.data.serializer.InternalRowSerializer; +import org.apache.paimon.memory.MemorySegment; +import org.apache.paimon.types.RowType; + +import org.apache.flink.core.io.SimpleVersionedSerializer; + +import java.io.IOException; +import java.nio.ByteBuffer; + +/** {@link SimpleVersionedSerializer} for {@link UpsertRecord}. */ +public class UpsertRecordSerializer implements SimpleVersionedSerializer { + + private final InternalRowSerializer rowSerializer; + private final int partitionArity; + + public UpsertRecordSerializer(RowType rowType, int partitionArity) { + this.rowSerializer = new InternalRowSerializer(rowType); + this.partitionArity = partitionArity; + } + + @Override + public int getVersion() { + return 1; + } + + @Override + public byte[] serialize(UpsertRecord record) throws IOException { + BinaryRow partition = record.partition(); + byte[] partitionBytes = partition.getFieldCount() == 0 ? new byte[0] : partition.toBytes(); + + BinaryRow binaryRow = rowSerializer.toBinaryRow(record.row()); + byte[] rowBytes = binaryRow.toBytes(); + + ByteBuffer buffer = + ByteBuffer.allocate(8 + 8 + 4 + partitionBytes.length + 4 + rowBytes.length); + buffer.putLong(record.firstRowId()); + buffer.putLong(record.offset()); + buffer.putInt(partitionBytes.length); + buffer.put(partitionBytes); + buffer.putInt(rowBytes.length); + buffer.put(rowBytes); + return buffer.array(); + } + + @Override + public UpsertRecord deserialize(int version, byte[] bytes) throws IOException { + if (version != getVersion()) { + throw new RuntimeException("Cannot deserialize version: " + version); + } + + ByteBuffer buffer = ByteBuffer.wrap(bytes); + long firstRowId = buffer.getLong(); + long offset = buffer.getLong(); + + int partitionLen = buffer.getInt(); + BinaryRow partition; + if (partitionLen == 0) { + partition = BinaryRow.EMPTY_ROW; + } else { + byte[] partitionBytes = new byte[partitionLen]; + buffer.get(partitionBytes); + partition = new BinaryRow(partitionArity); + partition.pointTo(MemorySegment.wrap(partitionBytes), 0, partitionLen); + } + + int rowLen = buffer.getInt(); + byte[] rowBytes = new byte[rowLen]; + buffer.get(rowBytes); + BinaryRow row = new BinaryRow(rowSerializer.getArity()); + row.pointTo(MemorySegment.wrap(rowBytes), 0, rowLen); + + return new UpsertRecord(partition, firstRowId, offset, row); + } +} diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/UpsertRecordTypeInfo.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/UpsertRecordTypeInfo.java new file mode 100644 index 000000000000..dfbbf67635ca --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/UpsertRecordTypeInfo.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.flink.dataevolution; + +import org.apache.paimon.flink.sink.NoneCopyVersionedSerializerTypeSerializerProxy; +import org.apache.paimon.types.RowType; + +import org.apache.flink.api.common.ExecutionConfig; +import org.apache.flink.api.common.serialization.SerializerConfig; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.common.typeutils.TypeSerializer; + +/** Type information of {@link UpsertRecord}. */ +public class UpsertRecordTypeInfo extends TypeInformation { + + private final RowType rowType; + private final int partitionArity; + + public UpsertRecordTypeInfo(RowType rowType, int partitionArity) { + this.rowType = rowType; + this.partitionArity = partitionArity; + } + + @Override + public boolean isBasicType() { + return false; + } + + @Override + public boolean isTupleType() { + return false; + } + + @Override + public int getArity() { + return 1; + } + + @Override + public int getTotalFields() { + return 1; + } + + @Override + public Class getTypeClass() { + return UpsertRecord.class; + } + + @Override + public boolean isKeyType() { + return false; + } + + /** + * Do not annotate with @override here to maintain compatibility with Flink 1.18-. + */ + public TypeSerializer createSerializer(SerializerConfig config) { + return this.createSerializer((ExecutionConfig) null); + } + + /** + * Do not annotate with @override here to maintain compatibility with Flink 2.0+. + */ + public TypeSerializer createSerializer(ExecutionConfig config) { + return new NoneCopyVersionedSerializerTypeSerializerProxy( + () -> new UpsertRecordSerializer(rowType, partitionArity)) {}; + } + + @Override + public int hashCode() { + return rowType.hashCode(); + } + + @Override + public boolean canEqual(Object obj) { + return obj instanceof UpsertRecordTypeInfo; + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof UpsertRecordTypeInfo)) { + return false; + } + UpsertRecordTypeInfo other = (UpsertRecordTypeInfo) obj; + return rowType.equals(other.rowType) && partitionArity == other.partitionArity; + } + + @Override + public String toString() { + return "UpsertRecordTypeInfo"; + } +} diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/UpsertWriteOperator.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/UpsertWriteOperator.java new file mode 100644 index 000000000000..54291e9f747b --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/UpsertWriteOperator.java @@ -0,0 +1,384 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.flink.dataevolution; + +import org.apache.paimon.CoreOptions; +import org.apache.paimon.data.BinaryRow; +import org.apache.paimon.data.GenericRow; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.flink.sink.Committable; +import org.apache.paimon.flink.sink.PrepareCommitOperator; +import org.apache.paimon.io.CompactIncrement; +import org.apache.paimon.io.DataFileMeta; +import org.apache.paimon.io.DataIncrement; +import org.apache.paimon.manifest.ManifestEntry; +import org.apache.paimon.operation.AbstractFileStoreWrite; +import org.apache.paimon.options.Options; +import org.apache.paimon.table.FileStoreTable; +import org.apache.paimon.table.SpecialFields; +import org.apache.paimon.table.sink.CommitMessage; +import org.apache.paimon.table.sink.CommitMessageImpl; +import org.apache.paimon.table.sink.TableWriteImpl; +import org.apache.paimon.table.source.DataSplit; +import org.apache.paimon.table.source.InnerTableRead; +import org.apache.paimon.types.RowType; +import org.apache.paimon.utils.CloseableIterator; +import org.apache.paimon.utils.CommitIncrement; +import org.apache.paimon.utils.Preconditions; +import org.apache.paimon.utils.ProjectedRow; +import org.apache.paimon.utils.RecordWriter; +import org.apache.paimon.utils.SnapshotManager; + +import org.apache.flink.streaming.api.operators.StreamOperator; +import org.apache.flink.streaming.api.operators.StreamOperatorParameters; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + +import static org.apache.paimon.format.blob.BlobFileFormat.isBlobFile; +import static org.apache.paimon.types.VectorType.isVectorStoreFile; + +/** + * Phase 2 operator for data evolution streaming upsert. Receives tagged {@link UpsertRecord}s from + * {@link UpsertClassifyOperator} after the firstRowId-based shuffle. Performs partial writes for + * UPDATE records and normal appends for INSERT records. + * + *

The shuffle guarantees that all updates targeting the same file (same firstRowId) arrive at + * the same subtask, eliminating concurrent-write conflicts. + */ +public class UpsertWriteOperator extends PrepareCommitOperator { + + private final FileStoreTable table; + + private transient List buffered; + private transient AbstractFileStoreWrite tableWrite; + private transient InnerTableRead tableRead; + private transient ProjectedRow projectedRow; + private transient SnapshotManager snapshotManager; + private transient Map> firstIdToFiles; + private transient RowType fullWriteType; + private transient RowType readType; + private transient InternalRow.FieldGetter[] fieldGetters; + + public UpsertWriteOperator( + StreamOperatorParameters parameters, FileStoreTable table) { + super(parameters, Options.fromMap(table.options())); + this.table = + table.copy(Collections.singletonMap(CoreOptions.TARGET_FILE_SIZE.key(), "99999 G")); + } + + @Override + public void open() throws Exception { + super.open(); + + this.fullWriteType = table.rowType(); + this.buffered = new ArrayList<>(); + + this.snapshotManager = table.store().snapshotManager(); + + this.readType = SpecialFields.rowTypeWithRowId(fullWriteType); + this.tableRead = table.newRead().withReadType(readType); + this.projectedRow = ProjectedRow.from(fullWriteType, readType); + + int colCount = fullWriteType.getFieldCount(); + this.fieldGetters = new InternalRow.FieldGetter[colCount]; + for (int i = 0; i < colCount; i++) { + fieldGetters[i] = InternalRow.createFieldGetter(fullWriteType.getTypeAt(i), i); + } + + @SuppressWarnings({"unchecked", "resource"}) + TableWriteImpl writeImpl = + (TableWriteImpl) + table.newBatchWriteBuilder().newWrite().withWriteType(fullWriteType); + this.tableWrite = (AbstractFileStoreWrite) writeImpl.getWrite(); + + this.firstIdToFiles = new HashMap<>(); + refreshFileMetadata(); + } + + @Override + public void processElement(StreamRecord element) throws Exception { + buffered.add(element.getValue()); + } + + @Override + protected List prepareCommit(boolean waitCompaction, long checkpointId) + throws IOException { + try { + refreshFileMetadata(); + + Map>> updatesByPartition = + new HashMap<>(); + Map> insertsByPartition = new HashMap<>(); + + for (UpsertRecord record : buffered) { + if (record.isInsert()) { + insertsByPartition + .computeIfAbsent(record.partition(), k -> new ArrayList<>()) + .add(record.row()); + } else { + updatesByPartition + .computeIfAbsent(record.partition(), k -> new TreeMap<>()) + .computeIfAbsent(record.firstRowId(), k -> new TreeMap<>()) + .put(record.offset(), record.row()); + } + } + buffered.clear(); + + List committables = new ArrayList<>(); + + for (Map.Entry>> partEntry : + updatesByPartition.entrySet()) { + BinaryRow partition = partEntry.getKey(); + for (Map.Entry> fileEntry : + partEntry.getValue().entrySet()) { + CommitMessage msg = + writePartialUpdate(partition, fileEntry.getKey(), fileEntry.getValue()); + committables.add(new Committable(checkpointId, msg)); + } + } + + for (Map.Entry> partEntry : + insertsByPartition.entrySet()) { + BinaryRow partition = partEntry.getKey(); + List rows = partEntry.getValue(); + + RecordWriter writer = tableWrite.createWriter(partition, 0); + try { + for (InternalRow row : rows) { + writer.write(row); + } + CommitIncrement increment = writer.prepareCommit(false); + List newFiles = increment.newFilesIncrement().newFiles(); + CommitMessage msg = + new CommitMessageImpl( + partition, + 0, + null, + new DataIncrement( + newFiles, + Collections.emptyList(), + Collections.emptyList()), + CompactIncrement.emptyIncrement()); + committables.add(new Committable(checkpointId, msg)); + } finally { + writer.close(); + } + } + + return committables; + } catch (Exception e) { + throw new IOException("Error in prepareCommit", e); + } + } + + @Override + public void close() throws Exception { + super.close(); + if (tableWrite != null) { + tableWrite.close(); + } + } + + private void refreshFileMetadata() { + Long latestId = snapshotManager.latestSnapshotId(); + if (latestId == null) { + return; + } + + List allEntries = + table.store() + .newScan() + .withManifestEntryFilter( + entry -> + entry.file().firstRowId() != null + && !isBlobFile(entry.file().fileName()) + && !isVectorStoreFile(entry.file().fileName())) + .withSnapshot(latestId) + .plan() + .files(); + + firstIdToFiles.clear(); + for (ManifestEntry entry : allEntries) { + long firstRowId = entry.file().nonNullFirstRowId(); + firstIdToFiles.computeIfAbsent(firstRowId, k -> new ArrayList<>()).add(entry.file()); + } + } + + private CommitMessage writePartialUpdate( + BinaryRow partition, long firstRowId, TreeMap updates) + throws Exception { + List oldFiles = + firstIdToFiles.getOrDefault(firstRowId, Collections.emptyList()); + Preconditions.checkState( + !oldFiles.isEmpty(), + String.format("Cannot find files for firstRowId: %s", firstRowId)); + + long rowCount = oldFiles.get(0).rowCount(); + + DataSplit dataSplit = + DataSplit.builder() + .withPartition(partition) + .withBucket(0) + .withDataFiles(oldFiles) + .withBucketPath( + table.store().pathFactory().bucketPath(partition, 0).toString()) + .rawConvertible(false) + .build(); + + int[] nonNullCols = computeAnyNonNullColumns(updates); + boolean isPartialColumn = nonNullCols.length < fullWriteType.getFieldCount(); + + ProjectedRow writeProjection; + if (isPartialColumn) { + RowType partialWriteType = fullWriteType.project(nonNullCols); + tableWrite.withWriteType(partialWriteType); + writeProjection = ProjectedRow.from(partialWriteType, readType); + } else { + writeProjection = projectedRow; + } + + RecordWriter writer = tableWrite.createWriter(partition, 0); + try { + //noinspection resource + try (CloseableIterator reader = + tableRead.createReader(dataSplit).toCloseableIterator()) { + long offset = 0; + while (reader.hasNext()) { + InternalRow originalRow = reader.next(); + InternalRow updateRow = updates.get(offset); + if (updateRow != null) { + writeProjection.replaceRow(mergeUpdateWithOriginal(updateRow, originalRow)); + } else { + writeProjection.replaceRow(originalRow); + } + writer.write(writeProjection); + offset++; + } + + Preconditions.checkState( + offset == rowCount, + String.format( + "Written num %s not equal to original row num %s", + offset, rowCount)); + } + + CommitIncrement written = writer.prepareCommit(false); + List newFiles = written.newFilesIncrement().newFiles(); + Preconditions.checkState( + newFiles.size() == 1, "Partial update should produce exactly one file"); + DataFileMeta newFile = newFiles.get(0).assignFirstRowId(firstRowId); + + List deletedFiles = isPartialColumn ? Collections.emptyList() : oldFiles; + return new CommitMessageImpl( + partition, + 0, + null, + new DataIncrement( + Collections.singletonList(newFile), + deletedFiles, + Collections.emptyList()), + CompactIncrement.emptyIncrement()); + } finally { + writer.close(); + if (isPartialColumn) { + tableWrite.withWriteType(fullWriteType); + } + } + } + + private int[] computeAnyNonNullColumns(TreeMap updates) { + int colCount = fullWriteType.getFieldCount(); + boolean[] anyNonNull = new boolean[colCount]; + for (InternalRow row : updates.values()) { + for (int i = 0; i < colCount; i++) { + if (!row.isNullAt(i)) { + anyNonNull[i] = true; + } + } + } + int count = 0; + for (boolean b : anyNonNull) { + if (b) { + count++; + } + } + int[] result = new int[count]; + int idx = 0; + for (int i = 0; i < colCount; i++) { + if (anyNonNull[i]) { + result[idx++] = i; + } + } + return result; + } + + private InternalRow mergeUpdateWithOriginal(InternalRow updateRow, InternalRow originalRow) { + int colCount = fullWriteType.getFieldCount(); + boolean needsMerge = false; + for (int i = 0; i < colCount; i++) { + if (updateRow.isNullAt(i)) { + needsMerge = true; + break; + } + } + if (!needsMerge) { + return updateRow; + } + GenericRow merged = new GenericRow(colCount); + for (int i = 0; i < colCount; i++) { + if (!updateRow.isNullAt(i)) { + merged.setField(i, fieldGetters[i].getFieldOrNull(updateRow)); + } else { + merged.setField(i, fieldGetters[i].getFieldOrNull(originalRow)); + } + } + return merged; + } + + /** Factory for creating {@link UpsertWriteOperator}. */ + public static class Factory extends PrepareCommitOperator.Factory { + + private final FileStoreTable table; + + public Factory(FileStoreTable table) { + super(Options.fromMap(table.options())); + this.table = table; + } + + @Override + @SuppressWarnings("unchecked") + public > T createStreamOperator( + StreamOperatorParameters parameters) { + return (T) new UpsertWriteOperator(parameters, table); + } + + @SuppressWarnings("rawtypes") + @Override + public Class getStreamOperatorClass(ClassLoader classLoader) { + return UpsertWriteOperator.class; + } + } +} diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/DataEvolutionUpsertSink.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/DataEvolutionUpsertSink.java new file mode 100644 index 000000000000..4708150a1356 --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/DataEvolutionUpsertSink.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.flink.sink; + +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.flink.dataevolution.UpsertClassifyOperator; +import org.apache.paimon.flink.dataevolution.UpsertRecord; +import org.apache.paimon.flink.dataevolution.UpsertRecordChannelComputer; +import org.apache.paimon.flink.dataevolution.UpsertRecordTypeInfo; +import org.apache.paimon.flink.dataevolution.UpsertWriteOperator; +import org.apache.paimon.manifest.ManifestCommittable; +import org.apache.paimon.table.FileStoreTable; + +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamSink; +import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; +import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; + +import javax.annotation.Nullable; + +import java.util.List; +import java.util.Map; + +import static org.apache.paimon.flink.utils.ParallelismUtils.forwardParallelism; + +/** + * A {@link FlinkWriteSink} for data evolution streaming upsert. Uses a two-phase pipeline: + * + *

    + *
  1. Phase 1 ({@link UpsertClassifyOperator}): classifies records as INSERT or UPDATE using a + * business-key index + *
  2. Network shuffle by firstRowId to ensure single-writer-per-file + *
  3. Phase 2 ({@link UpsertWriteOperator}): performs partial writes for updates and appends for + * inserts + *
+ */ +public class DataEvolutionUpsertSink extends FlinkWriteSink { + + private static final long serialVersionUID = 1L; + + private final List upsertKeyColumns; + + public DataEvolutionUpsertSink( + FileStoreTable table, + @Nullable Map overwritePartition, + List upsertKeyColumns) { + super(table, overwritePartition); + this.upsertKeyColumns = upsertKeyColumns; + } + + @Override + public DataStreamSink sinkFrom(DataStream input, String initialCommitUser) { + // Phase 1: classify each record as INSERT or UPDATE + SingleOutputStreamOperator classified = + input.transform( + "Upsert Classify : " + table.name(), + new UpsertRecordTypeInfo(table.rowType(), table.partitionKeys().size()), + new UpsertClassifyOperator.Factory(table, upsertKeyColumns)); + forwardParallelism(classified, input); + + // Shuffle by firstRowId to guarantee single-writer-per-file + DataStream shuffled = + FlinkStreamPartitioner.partition( + classified, new UpsertRecordChannelComputer(), null); + + // Phase 2: write (partial updates + inserts) + SingleOutputStreamOperator written = + shuffled.transform( + "Upsert Write : " + table.name(), + new CommittableTypeInfo(), + new UpsertWriteOperator.Factory(table)); + forwardParallelism(written, shuffled); + + return doCommit(written, initialCommitUser); + } + + @Override + protected OneInputStreamOperatorFactory createWriteOperatorFactory( + StoreSinkWrite.Provider writeProvider, String commitUser) { + throw new UnsupportedOperationException( + "DataEvolutionUpsertSink overrides sinkFrom directly"); + } + + @Override + protected CommittableStateManager createCommittableStateManager() { + return createRestoreOnlyCommittableStateManager(table); + } +} diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/FlinkSinkBuilder.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/FlinkSinkBuilder.java index ae8013b7e709..f97607404eee 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/FlinkSinkBuilder.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/FlinkSinkBuilder.java @@ -339,6 +339,26 @@ private DataStreamSink buildUnawareBucketSink(DataStream input) table.primaryKeys().isEmpty(), "Unaware bucket mode only works with append-only table for now."); + List upsertKeys = table.coreOptions().dataEvolutionUpsertKeyColumns(); + if (!upsertKeys.isEmpty()) { + checkArgument( + table.coreOptions().dataEvolutionEnabled(), + "data-evolution.upsert-keys requires data-evolution.enabled = true."); + checkArgument( + table.coreOptions().rowTrackingEnabled(), + "data-evolution.upsert-keys requires row-tracking.enabled = true."); + DataStream keyed = + partition( + input, + new UpsertKeyChannelComputer( + table.schema(), + upsertKeys, + table.coreOptions().dataEvolutionUpsertIndexParallelism()), + parallelism); + return new DataEvolutionUpsertSink(table, overwritePartition, upsertKeys) + .sinkFrom(keyed); + } + if (!table.partitionKeys().isEmpty()) { PartitionSinkStrategy strategy = table.coreOptions().partitionSinkStrategy(); if (strategy == PartitionSinkStrategy.HASH) { diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/UpsertKeyChannelComputer.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/UpsertKeyChannelComputer.java new file mode 100644 index 000000000000..0bde77c14ba0 --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/sink/UpsertKeyChannelComputer.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.flink.sink; + +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.data.InternalRow.FieldGetter; +import org.apache.paimon.schema.TableSchema; +import org.apache.paimon.table.sink.ChannelComputer; +import org.apache.paimon.types.RowType; + +import java.util.List; + +/** + * A {@link ChannelComputer} that partitions records by a compound hash of partition fields and + * upsert key columns. Uses {@code indexParallelism} to control how many subtasks each partition can + * be distributed to: + * + *
+ * slot = abs(partitionHash) * indexParallelism + abs(keyHash) % indexParallelism
+ * channel = slot % numChannels
+ * 
+ * + *

With {@code indexParallelism=1}, each partition maps to exactly one subtask, so the upsert key + * index is loaded only once per partition. Higher values spread the load across more subtasks at + * the cost of redundant index loading. + */ +public class UpsertKeyChannelComputer implements ChannelComputer { + + private static final long serialVersionUID = 1L; + + private final TableSchema schema; + private final List upsertKeyColumns; + private final int indexParallelism; + + private transient int numChannels; + private transient FieldGetter[] keyFieldGetters; + private transient FieldGetter[] partitionFieldGetters; + + public UpsertKeyChannelComputer( + TableSchema schema, List upsertKeyColumns, int indexParallelism) { + this.schema = schema; + this.upsertKeyColumns = upsertKeyColumns; + this.indexParallelism = indexParallelism; + } + + @Override + public void setup(int numChannels) { + this.numChannels = numChannels; + RowType rowType = schema.logicalRowType(); + + this.keyFieldGetters = new FieldGetter[upsertKeyColumns.size()]; + for (int i = 0; i < upsertKeyColumns.size(); i++) { + int idx = rowType.getFieldIndex(upsertKeyColumns.get(i)); + keyFieldGetters[i] = InternalRow.createFieldGetter(rowType.getTypeAt(idx), idx); + } + + List partitionKeys = schema.partitionKeys(); + this.partitionFieldGetters = new FieldGetter[partitionKeys.size()]; + for (int i = 0; i < partitionKeys.size(); i++) { + int idx = rowType.getFieldIndex(partitionKeys.get(i)); + partitionFieldGetters[i] = InternalRow.createFieldGetter(rowType.getTypeAt(idx), idx); + } + } + + @Override + public int channel(InternalRow record) { + int partHash = 0; + for (FieldGetter getter : partitionFieldGetters) { + Object val = getter.getFieldOrNull(record); + partHash = partHash * 31 + (val == null ? 0 : val.hashCode()); + } + + int keyHash = 0; + for (FieldGetter getter : keyFieldGetters) { + Object val = getter.getFieldOrNull(record); + keyHash = keyHash * 31 + (val == null ? 0 : val.hashCode()); + } + + long slot = + (long) (partHash & Integer.MAX_VALUE) * indexParallelism + + (keyHash & Integer.MAX_VALUE) % indexParallelism; + return (int) (slot % numChannels); + } + + @Override + public String toString() { + return "shuffle by partition+upsert key " + + upsertKeyColumns + + " (indexParallelism=" + + indexParallelism + + ")"; + } +} diff --git a/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/dataevolution/DataEvolutionUpsertITCase.java b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/dataevolution/DataEvolutionUpsertITCase.java new file mode 100644 index 000000000000..d0253efcf50c --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/dataevolution/DataEvolutionUpsertITCase.java @@ -0,0 +1,261 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.flink.dataevolution; + +import org.apache.paimon.flink.action.ActionITCaseBase; + +import org.apache.flink.types.Row; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.apache.paimon.CoreOptions.DATA_EVOLUTION_ENABLED; +import static org.apache.paimon.CoreOptions.ROW_TRACKING_ENABLED; +import static org.apache.paimon.flink.util.ReadWriteTableTestUtil.bEnv; +import static org.apache.paimon.flink.util.ReadWriteTableTestUtil.buildDdl; +import static org.apache.paimon.flink.util.ReadWriteTableTestUtil.init; +import static org.apache.paimon.flink.util.ReadWriteTableTestUtil.sEnv; +import static org.apache.paimon.flink.util.ReadWriteTableTestUtil.testBatchRead; + +/** ITCase for data evolution streaming upsert via {@code data-evolution.upsert-keys}. */ +public class DataEvolutionUpsertITCase extends ActionITCaseBase { + + @BeforeEach + public void setup() throws Exception { + init(warehouse); + } + + @Test + public void testBasicUpsert() throws Exception { + createTable("T", false); + batchInsert("T", "(1, 'a', 1.0)", "(2, 'b', 2.0)", "(3, 'c', 3.0)"); + + upsert("T", "id", "(1, 'a_new', 10.0)", "(3, 'c_new', 30.0)"); + + List expected = + Arrays.asList( + Row.of(1, "a_new", 10.0), Row.of(2, "b", 2.0), Row.of(3, "c_new", 30.0)); + testBatchRead("SELECT * FROM T ORDER BY id", expected); + } + + @Test + public void testInsertOnly() throws Exception { + createTable("T", false); + + upsert("T", "id", "(1, 'a', 1.0)", "(2, 'b', 2.0)", "(3, 'c', 3.0)"); + + List expected = + Arrays.asList(Row.of(1, "a", 1.0), Row.of(2, "b", 2.0), Row.of(3, "c", 3.0)); + testBatchRead("SELECT * FROM T ORDER BY id", expected); + } + + @Test + public void testMixedInsertAndUpdate() throws Exception { + createTable("T", false); + batchInsert("T", "(1, 'a', 1.0)", "(2, 'b', 2.0)", "(3, 'c', 3.0)"); + + upsert( + "T", + "id", + "(2, 'b_new', 20.0)", + "(3, 'c_new', 30.0)", + "(4, 'd', 4.0)", + "(5, 'e', 5.0)"); + + List expected = + Arrays.asList( + Row.of(1, "a", 1.0), + Row.of(2, "b_new", 20.0), + Row.of(3, "c_new", 30.0), + Row.of(4, "d", 4.0), + Row.of(5, "e", 5.0)); + testBatchRead("SELECT * FROM T ORDER BY id", expected); + } + + @Test + public void testMultipleUpserts() throws Exception { + createTable("T", false); + batchInsert("T", "(1, 'a', 1.0)", "(2, 'b', 2.0)", "(3, 'c', 3.0)"); + + upsert("T", "id", "(1, 'a2', 10.0)", "(4, 'd', 4.0)"); + + upsert("T", "id", "(2, 'b2', 20.0)", "(4, 'd2', 40.0)"); + + List expected = + Arrays.asList( + Row.of(1, "a2", 10.0), + Row.of(2, "b2", 20.0), + Row.of(3, "c", 3.0), + Row.of(4, "d2", 40.0)); + testBatchRead("SELECT * FROM T ORDER BY id", expected); + } + + @Test + public void testCompositeKey() throws Exception { + Map options = baseOptions(); + sEnv.executeSql( + buildDdl( + "T", + Arrays.asList("id INT", "sub_id INT", "name STRING"), + Collections.emptyList(), + Collections.emptyList(), + options)); + + bEnv.executeSql("INSERT INTO T VALUES (1, 1, 'a'), (1, 2, 'b'), (2, 1, 'c')").await(); + + sEnv.executeSql( + "INSERT INTO T /*+ OPTIONS('data-evolution.upsert-keys'='id,sub_id') */ " + + "VALUES (1, 2, 'b_new'), (2, 1, 'c_new'), (3, 1, 'd')") + .await(); + + List expected = + Arrays.asList( + Row.of(1, 1, "a"), + Row.of(1, 2, "b_new"), + Row.of(2, 1, "c_new"), + Row.of(3, 1, "d")); + testBatchRead("SELECT * FROM T ORDER BY id, sub_id", expected); + } + + @Test + public void testPartialColumnUpdate() throws Exception { + createTable("T", false); + batchInsert("T", "(1, 'a', 1.0)", "(2, 'b', 2.0)", "(3, 'c', 3.0)"); + + // Update only 'name' column; 'value' is NULL meaning "don't change" + upsert( + "T", + "id", + "(1, 'a_new', CAST(NULL AS DOUBLE))", + "(3, 'c_new', CAST(NULL AS DOUBLE))"); + + List expected = + Arrays.asList( + Row.of(1, "a_new", 1.0), Row.of(2, "b", 2.0), Row.of(3, "c_new", 3.0)); + testBatchRead("SELECT * FROM T ORDER BY id", expected); + } + + @Test + public void testPartialColumnUpdateValueOnly() throws Exception { + createTable("T", false); + batchInsert("T", "(1, 'a', 1.0)", "(2, 'b', 2.0)", "(3, 'c', 3.0)"); + + // Update only 'value' column; 'name' is NULL meaning "don't change" + upsert("T", "id", "(1, CAST(NULL AS STRING), 10.0)", "(2, CAST(NULL AS STRING), 20.0)"); + + List expected = + Arrays.asList(Row.of(1, "a", 10.0), Row.of(2, "b", 20.0), Row.of(3, "c", 3.0)); + testBatchRead("SELECT * FROM T ORDER BY id", expected); + } + + @Test + public void testPartialColumnUpdateThenFullUpdate() throws Exception { + createTable("T", false); + batchInsert("T", "(1, 'a', 1.0)", "(2, 'b', 2.0)"); + + // First: partial update (only name) + upsert("T", "id", "(1, 'a_v2', CAST(NULL AS DOUBLE))"); + + // Second: full update (all columns non-NULL) + upsert("T", "id", "(2, 'b_v2', 22.0)"); + + List expected = Arrays.asList(Row.of(1, "a_v2", 1.0), Row.of(2, "b_v2", 22.0)); + testBatchRead("SELECT * FROM T ORDER BY id", expected); + } + + @Test + public void testPartialColumnUpdateWithInsert() throws Exception { + createTable("T", false); + batchInsert("T", "(1, 'a', 1.0)", "(2, 'b', 2.0)"); + + // Mix partial update and new insert in the same upsert + upsert("T", "id", "(1, 'a_new', CAST(NULL AS DOUBLE))", "(3, 'new_row', 3.0)"); + + List expected = + Arrays.asList( + Row.of(1, "a_new", 1.0), Row.of(2, "b", 2.0), Row.of(3, "new_row", 3.0)); + testBatchRead("SELECT * FROM T ORDER BY id", expected); + } + + @Test + public void testMultiplePartialUpdates() throws Exception { + createTable("T", false); + batchInsert("T", "(1, 'a', 1.0)", "(2, 'b', 2.0)"); + + // First partial update: only name + upsert("T", "id", "(1, 'a_v2', CAST(NULL AS DOUBLE))"); + + // Second partial update: only value + upsert("T", "id", "(1, CAST(NULL AS STRING), 100.0)"); + + List expected = Arrays.asList(Row.of(1, "a_v2", 100.0), Row.of(2, "b", 2.0)); + testBatchRead("SELECT * FROM T ORDER BY id", expected); + } + + @Test + public void testMixedPartialColumnUpdatesInSameBatch() throws Exception { + createTable("T", false); + batchInsert("T", "(1, 'a', 1.0)", "(2, 'b', 2.0)"); + + // Row 1 updates 'name' only, row 2 updates 'value' only — different columns in same batch + upsert("T", "id", "(1, 'a_v2', CAST(NULL AS DOUBLE))", "(2, CAST(NULL AS STRING), 20.0)"); + + List expected = Arrays.asList(Row.of(1, "a_v2", 1.0), Row.of(2, "b", 20.0)); + testBatchRead("SELECT * FROM T ORDER BY id", expected); + } + + private void createTable(String tableName, boolean partitioned) { + List fields = Arrays.asList("id INT", "name STRING", "`value` DOUBLE"); + List partitionKeys = + partitioned ? Collections.singletonList("dt") : Collections.emptyList(); + if (partitioned) { + fields = Arrays.asList("id INT", "name STRING", "`value` DOUBLE", "dt STRING"); + } + sEnv.executeSql( + buildDdl(tableName, fields, Collections.emptyList(), partitionKeys, baseOptions())); + } + + private static Map baseOptions() { + Map options = new HashMap<>(); + options.put(ROW_TRACKING_ENABLED.key(), "true"); + options.put(DATA_EVOLUTION_ENABLED.key(), "true"); + options.put("bucket", "-1"); + return options; + } + + private void batchInsert(String tableName, String... records) throws Exception { + bEnv.executeSql( + String.format( + "INSERT INTO `%s` VALUES %s", tableName, String.join(",", records))) + .await(); + } + + private void upsert(String tableName, String upsertKeys, String... records) throws Exception { + sEnv.executeSql( + String.format( + "INSERT INTO `%s` /*+ OPTIONS('data-evolution.upsert-keys'='%s') */ VALUES %s", + tableName, upsertKeys, String.join(",", records))) + .await(); + } +}