-
Notifications
You must be signed in to change notification settings - Fork 2.5k
feat(utilities): add Spark/HoodieStreamer validators for pre-commit validation - Phase 3 #18405
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
5f86fb6
dd19d66
129e76e
5f810df
07f8ab8
5798d9f
34e50d6
97b99fd
2656b70
156a79d
5f4bca6
3f39bf3
17ece14
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -84,9 +84,28 @@ public static void runValidators(HoodieWriteConfig config, | |
| Dataset<Row> beforeState = getRecordsFromCommittedFiles(sqlContext, partitionsModified, table, afterState.schema()); | ||
|
|
||
| Stream<SparkPreCommitValidator> validators = Arrays.stream(config.getPreCommitValidators().split(",")) | ||
| .map(validatorClass -> ((SparkPreCommitValidator) ReflectionUtils.loadClass(validatorClass, | ||
| new Class<?>[] {HoodieSparkTable.class, HoodieEngineContext.class, HoodieWriteConfig.class}, | ||
| table, context, config))); | ||
| .map(String::trim) | ||
| .filter(s -> !s.isEmpty()) | ||
| .flatMap(validatorClass -> { | ||
| try { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🤖 Unlike
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed. Added `.filter(s -> !s.isEmpty())` before the `Class.forName` call, matching the guard already present in `SparkStreamerValidatorUtils`. |
||
| Class<?> clazz = Class.forName(validatorClass); | ||
| if (!SparkPreCommitValidator.class.isAssignableFrom(clazz)) { | ||
| LOG.warn("Skipping validator {} — it does not implement SparkPreCommitValidator. " | ||
| + "If this is a streaming offset validator (e.g. SparkKafkaOffsetValidator), " | ||
| + "it will be invoked by SparkStreamerValidatorUtils instead.", validatorClass); | ||
| return Stream.empty(); | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🤖 nit: after the - AI-generated; verify before applying. React 👍/👎 to flag quality.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed |
||
| SparkPreCommitValidator validator = (SparkPreCommitValidator) ReflectionUtils.loadClass( | ||
| validatorClass, | ||
| new Class<?>[] {HoodieSparkTable.class, HoodieEngineContext.class, HoodieWriteConfig.class}, | ||
| table, context, config); | ||
| return Stream.of(validator); | ||
| } catch (ClassNotFoundException e) { | ||
| throw new HoodieValidationException("Cannot find validator class: " + validatorClass, e); | ||
| } catch (ReflectiveOperationException e) { | ||
| throw new HoodieValidationException("Failed to instantiate validator: " + validatorClass, e); | ||
| } | ||
| }); | ||
|
|
||
| boolean allSuccess = validators.map(v -> runValidatorAsync(v, writeMetadata, beforeState, afterState, instantTime)).map(CompletableFuture::join) | ||
| .reduce(true, Boolean::logicalAnd); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -74,6 +74,7 @@ | |
| import org.apache.hudi.config.HoodieErrorTableConfig; | ||
| import org.apache.hudi.config.HoodieIndexConfig; | ||
| import org.apache.hudi.config.HoodiePayloadConfig; | ||
| import org.apache.hudi.config.HoodiePreCommitValidatorConfig; | ||
| import org.apache.hudi.config.HoodieWriteConfig; | ||
| import org.apache.hudi.config.metrics.HoodieMetricsConfig; | ||
| import org.apache.hudi.data.HoodieJavaRDD; | ||
|
|
@@ -115,6 +116,7 @@ | |
| import org.apache.hudi.utilities.sources.InputBatch; | ||
| import org.apache.hudi.utilities.sources.Source; | ||
| import org.apache.hudi.utilities.streamer.HoodieStreamer.Config; | ||
| import org.apache.hudi.utilities.streamer.validator.SparkStreamerValidatorUtils; | ||
| import org.apache.hudi.utilities.transform.Transformer; | ||
|
|
||
| import com.codahale.metrics.Timer; | ||
|
|
@@ -128,6 +130,7 @@ | |
| import org.apache.spark.sql.Row; | ||
| import org.apache.spark.sql.SparkSession; | ||
| import org.apache.spark.sql.types.StructType; | ||
| import org.apache.spark.storage.StorageLevel; | ||
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
|
|
||
|
|
@@ -874,8 +877,38 @@ private Pair<Option<String>, JavaRDD<WriteStatus>> writeToSinkAndDoMetaSync(Hood | |
| totalSuccessfulRecords); | ||
| String commitActionType = CommitUtils.getCommitActionType(cfg.operation, HoodieTableType.valueOf(cfg.tableType)); | ||
|
|
||
| boolean success = writeClient.commit(instantTime, writeStatusRDD, Option.of(checkpointCommitMetadata), commitActionType, partitionToReplacedFileIds, Option.empty(), | ||
| Option.of(writeStatusValidator)); | ||
| // Cache the RDD only when pre-commit validators are configured. Validators collect the RDD | ||
| // before commit, so without caching the same DAG would re-evaluate inside writeClient.commit(). | ||
| // When no validators are configured, commit consumes the RDD once and caching adds no value. | ||
| // shouldUnpersist is true only when we created the cache here (validators present and storage | ||
| // level was NONE), so the finally block knows to release it. | ||
| boolean validatorsConfigured = !StringUtils.isNullOrEmpty(props.getString( | ||
| HoodiePreCommitValidatorConfig.VALIDATOR_CLASS_NAMES.key(), | ||
| HoodiePreCommitValidatorConfig.VALIDATOR_CLASS_NAMES.defaultValue())); | ||
| boolean shouldUnpersist = validatorsConfigured && writeStatusRDD.getStorageLevel().equals(StorageLevel.NONE()); | ||
|
codope marked this conversation as resolved.
|
||
| if (shouldUnpersist) { | ||
| writeStatusRDD.cache(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks @danny0405! 1. Conditional cache — done in 5f4bca6. The cache/unpersist cycle is now guarded by a 2. Migrating
A Would you be OK if I file a follow-up issue + PR for that refactor so this PR can stay focused on the streaming-offset framework? Happy to take it on right after this lands.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Filed #18750 to track the |
||
| } | ||
| boolean success; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🤖 If - AI-generated; verify before applying. React 👍/👎 to flag quality.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed |
||
| try { | ||
| if (validatorsConfigured) { | ||
| List<WriteStatus> writeStatuses = writeStatusRDD.collect(); | ||
|
|
||
| // Run pre-commit streaming offset validators (if configured). | ||
| // Placement before writeClient.commit() is intentional: offset validation is a stronger | ||
| // guard than commitOnErrors — if offset deviation indicates potential data loss, the commit | ||
| // must be prevented regardless of the commitOnErrors policy. | ||
| SparkStreamerValidatorUtils.runValidators(props, instantTime, writeStatuses, | ||
| checkpointCommitMetadata, metaClient); | ||
| } | ||
|
|
||
| success = writeClient.commit(instantTime, writeStatusRDD, Option.of(checkpointCommitMetadata), commitActionType, partitionToReplacedFileIds, Option.empty(), | ||
| Option.of(writeStatusValidator)); | ||
| } finally { | ||
| if (shouldUnpersist) { | ||
| writeStatusRDD.unpersist(); | ||
| } | ||
| } | ||
| releaseResourcesInvoked = true; | ||
| if (success) { | ||
| LOG.info("Commit " + instantTime + " successful!"); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🤖 If - Generated by an AI agent and may contain mistakes. Please verify any suggestions before applying. |
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🤖 nit: this flatMap lambda has grown to ~16 lines doing class loading, type checking, reflection and two distinct exception translations. Could you extract it into a small private helper like
`instantiateSparkValidator(String validatorClass, HoodieSparkTable table, HoodieEngineContext context, HoodieWriteConfig config)` returning `Stream<SparkPreCommitValidator>`? Would make the stream pipeline read at a glance. - AI-generated; verify before applying. React 👍/👎 to flag quality.