-
Notifications
You must be signed in to change notification settings - Fork 2.5k
feat(common): When inferring checkpoint/schema from timeline, check non-ingestion write commits (in case they have metadata rolled-over) #18576
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3f4c06e
2b34164
9cb152d
57b6ac2
ee3f2ab
a3e67f4
c7dfd44
1c8b339
71a919b
59bbf7a
4b28bed
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -55,8 +55,10 @@ | |
| import org.apache.hudi.common.model.WriteConcurrencyMode; | ||
| import org.apache.hudi.common.model.WriteOperationType; | ||
| import org.apache.hudi.common.table.HoodieTableMetaClient; | ||
| import org.apache.hudi.common.table.TableSchemaResolver; | ||
| import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; | ||
| import org.apache.hudi.common.table.timeline.HoodieInstant; | ||
| import org.apache.hudi.common.table.timeline.HoodieTimeline; | ||
| import org.apache.hudi.common.table.timeline.InstantGenerator; | ||
| import org.apache.hudi.common.table.timeline.TimelineFactory; | ||
| import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; | ||
|
|
@@ -1992,6 +1994,78 @@ protected void autoCleanOnCommit() { | |
| } | ||
| } | ||
|
|
||
| @Test | ||
| public void testRollingMetadataPreservedAcrossClusteringAfterArchival() throws Exception { | ||
| String schemaKey = HoodieCommitMetadata.SCHEMA_KEY; | ||
| dataGen = new HoodieTestDataGenerator(new String[] {DEFAULT_FIRST_PARTITION_PATH}); | ||
|
|
||
| HoodieWriteConfig writeConfig = getConfigBuilder(TRIP_EXAMPLE_SCHEMA) | ||
| .withCompactionConfig(HoodieCompactionConfig.newBuilder() | ||
| .compactionSmallFileSize(0).build()) | ||
| .withRollingMetadataKeys(schemaKey) | ||
| .withArchivalConfig(HoodieArchivalConfig.newBuilder() | ||
| .archiveCommitsWith(2, 3).build()) | ||
| .withCleanConfig(HoodieCleanConfig.newBuilder() | ||
| .withAutoClean(false).build()) | ||
| .build(); | ||
|
|
||
| SparkRDDWriteClient client = getHoodieWriteClient(writeConfig); | ||
|
|
||
| // Insert multiple batches to create file groups for clustering | ||
| for (int i = 0; i < 5; i++) { | ||
| insertCommitWithSchema(client, dataGen, 20, TRIP_EXAMPLE_SCHEMA); | ||
| } | ||
|
|
||
| HoodieWriteConfig clusterConfig = getConfigBuilder(TRIP_EXAMPLE_SCHEMA) | ||
| .withClusteringConfig(createClusteringBuilder(true, 1).build()) | ||
| .withRollingMetadataKeys(schemaKey) | ||
| .withArchivalConfig(HoodieArchivalConfig.newBuilder() | ||
| .archiveCommitsWith(2, 3).build()) | ||
| .withCleanConfig(HoodieCleanConfig.newBuilder() | ||
| .withAutoClean(false).build()) | ||
| .build(); | ||
|
|
||
| for (int round = 0; round < 2; round++) { | ||
| SparkRDDWriteClient clusterWriter = getHoodieWriteClient(clusterConfig); | ||
| Option<String> clusteringInstant = clusterWriter.scheduleClustering(Option.empty()); | ||
| assertTrue(clusteringInstant.isPresent(), | ||
| "Clustering plan should be created (round " + round + ")"); | ||
| clusterWriter.cluster(clusteringInstant.get()); | ||
|
|
||
| // Only insert after the first round so that the second clustering instant | ||
| // remains on the active timeline after archival | ||
| if (round < 1) { | ||
| for (int i = 0; i < 3; i++) { | ||
| insertCommitWithSchema(client, dataGen, 20, TRIP_EXAMPLE_SCHEMA); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| client.archive(); | ||
|
|
||
| HoodieTableMetaClient freshMeta = HoodieTableMetaClient.reload(metaClient); | ||
| HoodieTimeline completedTimeline = freshMeta.getActiveTimeline() | ||
| .getCommitsTimeline().filterCompletedInstants(); | ||
|
|
||
| boolean foundSchemaInClustering = false; | ||
| for (HoodieInstant instant : completedTimeline.getInstants()) { | ||
| HoodieCommitMetadata metadata = completedTimeline.readCommitMetadata(instant); | ||
| if (metadata.getOperationType() == WriteOperationType.CLUSTER) { | ||
| String schema = metadata.getMetadata(schemaKey); | ||
| if (schema != null && !schema.isEmpty()) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🤖 nit: could you use - AI-generated; verify before applying. React 👍/👎 to flag quality. |
||
| foundSchemaInClustering = true; | ||
| break; | ||
| } | ||
| } | ||
| } | ||
| assertTrue(foundSchemaInClustering, | ||
| "Schema should be rolled over into clustering commits via rolling metadata"); | ||
|
kbuci marked this conversation as resolved.
|
||
|
|
||
| TableSchemaResolver resolver = new TableSchemaResolver(freshMeta); | ||
| assertTrue(resolver.getTableSchemaIfPresent(false).isPresent(), | ||
| "TableSchemaResolver should find schema even with clustering-only timeline"); | ||
| } | ||
|
|
||
| /** | ||
| * Disabling row writer here as clustering tests will throw the error below if it is used. | ||
| * java.util.concurrent.CompletionException: java.lang.ClassNotFoundException | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,13 +22,14 @@ | |
| import org.apache.hudi.common.model.HoodieCommitMetadata; | ||
| import org.apache.hudi.common.table.HoodieTableMetaClient; | ||
| import org.apache.hudi.common.table.checkpoint.CheckpointUtils; | ||
| import org.apache.hudi.common.util.StringUtils; | ||
| import org.apache.hudi.exception.HoodieException; | ||
| import org.apache.hudi.exception.HoodieIOException; | ||
| import org.apache.hudi.hadoop.fs.HadoopFSUtils; | ||
|
|
||
| import org.apache.hadoop.conf.Configuration; | ||
|
|
||
| import java.io.IOException; | ||
| import java.util.Objects; | ||
|
|
||
| /** | ||
| * This is used to set a checkpoint from latest commit of another (mirror) hudi dataset. | ||
|
|
@@ -52,7 +53,11 @@ public void init(Configuration config) throws HoodieException { | |
|
|
||
| @Override | ||
| public String getCheckpoint() throws HoodieException { | ||
| return anotherDsHoodieMetaClient.getCommitsTimeline().filterCompletedInstants().getReverseOrderedInstants() | ||
| // Use getWriteTimeline() to include compaction/logcompaction in addition to | ||
| // commit/deltacommit/replacecommit, so checkpoint metadata rolled into any | ||
| // non-ingestion commit type is discoverable after archival. | ||
| return anotherDsHoodieMetaClient.getActiveTimeline().getWriteTimeline() | ||
| .filterCompletedInstants().getReverseOrderedInstants() | ||
| .map(instant -> { | ||
| try { | ||
| HoodieCommitMetadata commitMetadata = | ||
|
|
@@ -63,9 +68,11 @@ public String getCheckpoint() throws HoodieException { | |
| // No checkpoint found in this commit | ||
| return null; | ||
| } catch (IOException e) { | ||
| return null; | ||
| throw new HoodieIOException("Failed to read commit metadata for instant " + instant.requestedTime(), e); | ||
| } | ||
| }).filter(Objects::nonNull).findFirst() | ||
| // Filter out null (from HoodieException) and empty strings (from commits | ||
| // that don't have checkpoint metadata, e.g. when rollover is not configured) | ||
| }).filter(key -> !StringUtils.isNullOrEmpty(key)).findFirst() | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🤖 nit: the lambda parameter - AI-generated; verify before applying. React 👍/👎 to flag quality. |
||
| .orElseThrow(() -> new HoodieException("Unable to find checkpoint in source table at: " | ||
| + path + ". This table may not have been created with checkpoint tracking enabled.")); | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🤖 nit:
`round < 1` effectively means `round == 0` — could you use `round == 0` directly? The `< 1` form makes a reader pause to ask whether a negative round value is ever possible. - AI-generated; verify before applying. React 👍/👎 to flag quality.