Spark 3.2: Refactor certain actions #4469
First file in the diff: BaseDeleteOrphanFilesSparkAction.
@@ -87,22 +87,20 @@ public class BaseDeleteOrphanFilesSparkAction
      }
    }, DataTypes.StringType);

-  private static final ExecutorService DEFAULT_DELETE_EXECUTOR_SERVICE = null;
-
   private final SerializableConfiguration hadoopConf;
   private final int partitionDiscoveryParallelism;
   private final Table table;
-
-  private String location = null;
-  private long olderThanTimestamp = System.currentTimeMillis() - TimeUnit.DAYS.toMillis(3);
-  private Consumer<String> deleteFunc = new Consumer<String>() {
+  private final Consumer<String> defaultDelete = new Consumer<String>() {
Author comment: In most other places, we have a separate …
     @Override
     public void accept(String file) {
       table.io().deleteFile(file);
     }
   };

-  private ExecutorService deleteExecutorService = DEFAULT_DELETE_EXECUTOR_SERVICE;
+  private String location = null;
+  private long olderThanTimestamp = System.currentTimeMillis() - TimeUnit.DAYS.toMillis(3);
+  private Consumer<String> deleteFunc = defaultDelete;
+  private ExecutorService deleteExecutorService = null;

   public BaseDeleteOrphanFilesSparkAction(SparkSession spark, Table table) {
     super(spark);
@@ -114,7 +112,7 @@ public BaseDeleteOrphanFilesSparkAction(SparkSession spark, Table table) {

     ValidationException.check(
         PropertyUtil.propertyAsBoolean(table.properties(), GC_ENABLED, GC_ENABLED_DEFAULT),
-        "Cannot remove orphan files: GC is disabled (deleting files may corrupt other tables)");
+        "Cannot delete orphan files: GC is disabled (deleting files may corrupt other tables)");
Author comment: I renamed the action a while ago but missed some places.
   }

   @Override
@@ -148,7 +146,7 @@ public BaseDeleteOrphanFilesSparkAction deleteWith(Consumer<String> newDeleteFun

   @Override
   public DeleteOrphanFiles.Result execute() {
-    JobGroupInfo info = newJobGroupInfo("REMOVE-ORPHAN-FILES", jobDesc());
+    JobGroupInfo info = newJobGroupInfo("DELETE-ORPHAN-FILES", jobDesc());
     return withJobGroupInfo(info, this::doExecute);
   }
@@ -158,7 +156,7 @@ private String jobDesc() {
     if (location != null) {
       options.add("location=" + location);
     }
-    return String.format("Removing orphan files (%s) from %s", Joiner.on(',').join(options), table.name());
+    return String.format("Deleting orphan files (%s) from %s", Joiner.on(',').join(options), table.name());
   }

   private DeleteOrphanFiles.Result doExecute() {
@@ -167,10 +165,10 @@ private DeleteOrphanFiles.Result doExecute() {
     Dataset<Row> validFileDF = validContentFileDF.union(validMetadataFileDF);
     Dataset<Row> actualFileDF = buildActualFileDF();

-    Column actualFileName = filenameUDF.apply(actualFileDF.col("file_path"));
-    Column validFileName = filenameUDF.apply(validFileDF.col("file_path"));
+    Column actualFileName = filenameUDF.apply(actualFileDF.col(FILE_PATH));
+    Column validFileName = filenameUDF.apply(validFileDF.col(FILE_PATH));
     Column nameEqual = actualFileName.equalTo(validFileName);
-    Column actualContains = actualFileDF.col("file_path").contains(validFileDF.col("file_path"));
+    Column actualContains = actualFileDF.col(FILE_PATH).contains(validFileDF.col(FILE_PATH));
     Column joinCond = nameEqual.and(actualContains);
     List<String> orphanFiles = actualFileDF.join(validFileDF, joinCond, "leftanti")
         .as(Encoders.STRING())
@@ -198,7 +196,7 @@ private Dataset<Row> buildActualFileDF() {
     JavaRDD<String> matchingFileRDD = sparkContext().parallelize(matchingFiles, 1);

     if (subDirs.isEmpty()) {
-      return spark().createDataset(matchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path");
+      return spark().createDataset(matchingFileRDD.rdd(), Encoders.STRING()).toDF(FILE_PATH);
     }

     int parallelism = Math.min(subDirs.size(), partitionDiscoveryParallelism);
@@ -208,7 +206,7 @@ private Dataset<Row> buildActualFileDF() {
     JavaRDD<String> matchingLeafFileRDD = subDirRDD.mapPartitions(listDirsRecursively(conf, olderThanTimestamp));

     JavaRDD<String> completeMatchingFileRDD = matchingFileRDD.union(matchingLeafFileRDD);
-    return spark().createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path");
+    return spark().createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING()).toDF(FILE_PATH);
   }

   private static void listDirRecursively(
Second file in the diff: BaseDeleteReachableFilesSparkAction.
@@ -20,11 +20,9 @@
 package org.apache.iceberg.spark.actions;

 import java.util.Iterator;
-import java.util.List;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.function.Consumer;
-import org.apache.iceberg.ReachableFileUtil;
 import org.apache.iceberg.Table;
 import org.apache.iceberg.TableMetadata;
 import org.apache.iceberg.TableMetadataParser;
@@ -35,15 +33,12 @@
 import org.apache.iceberg.hadoop.HadoopFileIO;
 import org.apache.iceberg.io.FileIO;
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
-import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.spark.JobGroupInfo;
 import org.apache.iceberg.util.PropertyUtil;
 import org.apache.iceberg.util.Tasks;
 import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SparkSession;
-import org.apache.spark.sql.functions;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -57,37 +52,27 @@
 @SuppressWarnings("UnnecessaryAnonymousClass")
 public class BaseDeleteReachableFilesSparkAction
     extends BaseSparkAction<DeleteReachableFiles, DeleteReachableFiles.Result> implements DeleteReachableFiles {
-  private static final Logger LOG = LoggerFactory.getLogger(BaseDeleteReachableFilesSparkAction.class);
-
-  private static final String CONTENT_FILE = "Content File";
-  private static final String MANIFEST = "Manifest";
-  private static final String MANIFEST_LIST = "Manifest List";
-  private static final String OTHERS = "Others";

-  private static final String STREAM_RESULTS = "stream-results";
+  public static final String STREAM_RESULTS = "stream-results";
+  public static final boolean STREAM_RESULTS_DEFAULT = false;

-  // Creates an executor service that runs each task in the thread that invokes execute/submit.
-  private static final ExecutorService DEFAULT_DELETE_EXECUTOR_SERVICE = null;
-
-  private final TableMetadata tableMetadata;
+  private static final Logger LOG = LoggerFactory.getLogger(BaseDeleteReachableFilesSparkAction.class);

+  private final String metadataFileLocation;
   private final Consumer<String> defaultDelete = new Consumer<String>() {
     @Override
     public void accept(String file) {
       io.deleteFile(file);
     }
   };

-  private Consumer<String> removeFunc = defaultDelete;
-  private ExecutorService removeExecutorService = DEFAULT_DELETE_EXECUTOR_SERVICE;
+  private Consumer<String> deleteFunc = defaultDelete;
+  private ExecutorService deleteExecutorService = null;
   private FileIO io = new HadoopFileIO(spark().sessionState().newHadoopConf());

-  public BaseDeleteReachableFilesSparkAction(SparkSession spark, String metadataLocation) {
+  public BaseDeleteReachableFilesSparkAction(SparkSession spark, String metadataFileLocation) {
     super(spark);
-    this.tableMetadata = TableMetadataParser.read(io, metadataLocation);
Author comment: We should not try to read the metadata using the default …
-    ValidationException.check(
-        PropertyUtil.propertyAsBoolean(tableMetadata.properties(), GC_ENABLED, GC_ENABLED_DEFAULT),
-        "Cannot remove files: GC is disabled (deleting files may corrupt other tables)");
+    this.metadataFileLocation = metadataFileLocation;
   }

   @Override
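The comment and the constructor change above move the metadata read out of the constructor: only the location is captured there, and the file is parsed later in doExecute(), so a FileIO supplied through io(...) after construction is actually used. A self-contained sketch of that ordering (hypothetical DeferredMetadataAction, with a plain Function standing in for FileIO):

```java
import java.util.function.Function;

class DeferredMetadataAction {
  private final String metadataFileLocation;
  private Function<String, String> io = path -> "metadata read with default IO: " + path;

  DeferredMetadataAction(String metadataFileLocation) {
    this.metadataFileLocation = metadataFileLocation;  // no I/O happens here
  }

  DeferredMetadataAction io(Function<String, String> customIo) {
    this.io = customIo;  // may be called after construction, before execute()
    return this;
  }

  String execute() {
    // Only now is the location resolved, so a custom IO set after construction is honored.
    return io.apply(metadataFileLocation);
  }

  public static void main(String[] args) {
    String result = new DeferredMetadataAction("s3://bucket/table/metadata/v1.metadata.json")
        .io(path -> "metadata read with custom IO: " + path)
        .execute();
    System.out.println(result);
  }
}
```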
@@ -102,54 +87,48 @@ public DeleteReachableFiles io(FileIO fileIO) {
   }

   @Override
-  public DeleteReachableFiles deleteWith(Consumer<String> deleteFunc) {
-    this.removeFunc = deleteFunc;
+  public DeleteReachableFiles deleteWith(Consumer<String> newDeleteFunc) {
+    this.deleteFunc = newDeleteFunc;
     return this;
   }

   @Override
   public DeleteReachableFiles executeDeleteWith(ExecutorService executorService) {
-    this.removeExecutorService = executorService;
+    this.deleteExecutorService = executorService;
     return this;
   }

   @Override
   public Result execute() {
     Preconditions.checkArgument(io != null, "File IO cannot be null");
-    String msg = String.format("Removing files reachable from %s", tableMetadata.metadataFileLocation());
-    JobGroupInfo info = newJobGroupInfo("REMOVE-FILES", msg);
+    String jobDesc = String.format("Deleting files reachable from %s", metadataFileLocation);
+    JobGroupInfo info = newJobGroupInfo("DELETE-REACHABLE-FILES", jobDesc);
     return withJobGroupInfo(info, this::doExecute);
   }

   private Result doExecute() {
-    boolean streamResults = PropertyUtil.propertyAsBoolean(options(), STREAM_RESULTS, false);
-    Dataset<Row> validFileDF = buildValidFileDF(tableMetadata).distinct();
+    TableMetadata metadata = TableMetadataParser.read(io, metadataFileLocation);
+
+    ValidationException.check(
+        PropertyUtil.propertyAsBoolean(metadata.properties(), GC_ENABLED, GC_ENABLED_DEFAULT),
+        "Cannot delete files: GC is disabled (deleting files may corrupt other tables)");
+
+    Dataset<Row> reachableFileDF = buildReachableFileDF(metadata).distinct();
+
+    boolean streamResults = PropertyUtil.propertyAsBoolean(options(), STREAM_RESULTS, STREAM_RESULTS_DEFAULT);
     if (streamResults) {
-      return deleteFiles(validFileDF.toLocalIterator());
+      return deleteFiles(reachableFileDF.toLocalIterator());
     } else {
-      return deleteFiles(validFileDF.collectAsList().iterator());
+      return deleteFiles(reachableFileDF.collectAsList().iterator());
     }
   }

-  private Dataset<Row> projectFilePathWithType(Dataset<Row> ds, String type) {
Author comment: There are multiple actions that define the exact same method.
-    return ds.select(functions.col("file_path"), functions.lit(type).as("file_type"));
-  }
-
-  private Dataset<Row> buildValidFileDF(TableMetadata metadata) {
+  private Dataset<Row> buildReachableFileDF(TableMetadata metadata) {
     Table staticTable = newStaticTable(metadata, io);
-    return projectFilePathWithType(buildValidContentFileDF(staticTable), CONTENT_FILE)
-        .union(projectFilePathWithType(buildManifestFileDF(staticTable), MANIFEST))
-        .union(projectFilePathWithType(buildManifestListDF(staticTable), MANIFEST_LIST))
-        .union(projectFilePathWithType(buildOtherMetadataFileDF(staticTable), OTHERS));
-  }
-
-  @Override
-  protected Dataset<Row> buildOtherMetadataFileDF(Table table) {
Author comment: This used to overload a method in the base class. I defined …
-    List<String> otherMetadataFiles = Lists.newArrayList();
-    otherMetadataFiles.addAll(ReachableFileUtil.metadataFileLocations(table, true));
-    otherMetadataFiles.add(ReachableFileUtil.versionHintLocation(table));
-    return spark().createDataset(otherMetadataFiles, Encoders.STRING()).toDF("file_path");
+    return withFileType(buildValidContentFileDF(staticTable), CONTENT_FILE)
+        .union(withFileType(buildManifestFileDF(staticTable), MANIFEST))
+        .union(withFileType(buildManifestListDF(staticTable), MANIFEST_LIST))
+        .union(withFileType(buildAllReachableOtherMetadataFileDF(staticTable), OTHERS));
   }

   /**
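Both comments in this hunk point at helpers that now live in the shared base action class: a FILE_PATH column constant, a withFileType projection replacing the per-action projectFilePathWithType copies, and buildAllReachableOtherMetadataFileDF instead of overloading buildOtherMetadataFileDF. The diff only shows the call sites; the sketch below is an assumed shape of the withFileType helper, not the actual base-class code:

```java
import java.util.Arrays;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.lit;

public class WithFileTypeSketch {
  // Assumed constants; the diff shows FILE_PATH exists, the exact values are a guess.
  static final String FILE_PATH = "file_path";
  static final String FILE_TYPE = "file_type";

  // Tags a file_path dataset with a literal type column so content files, manifests,
  // manifest lists, and other metadata can be unioned and counted by type later.
  static Dataset<Row> withFileType(Dataset<Row> ds, String type) {
    return ds.select(col(FILE_PATH), lit(type).as(FILE_TYPE));
  }

  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().master("local[1]").appName("with-file-type-sketch").getOrCreate();
    Dataset<Row> contentFiles = spark.createDataset(
        Arrays.asList("s3://bucket/tbl/data/a.parquet"), Encoders.STRING()).toDF(FILE_PATH);
    withFileType(contentFiles, "Content File").show(false);
    spark.stop();
  }
}
```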
@@ -166,7 +145,7 @@ private BaseDeleteReachableFilesActionResult deleteFiles(Iterator<Row> deleted)

     Tasks.foreach(deleted)
         .retry(3).stopRetryOn(NotFoundException.class).suppressFailureWhenFinished()
-        .executeWith(removeExecutorService)
+        .executeWith(deleteExecutorService)
         .onFailure((fileInfo, exc) -> {
           String file = fileInfo.getString(0);
           String type = fileInfo.getString(1);
@@ -175,7 +154,7 @@ private BaseDeleteReachableFilesActionResult deleteFiles(Iterator<Row> deleted)
         .run(fileInfo -> {
           String file = fileInfo.getString(0);
           String type = fileInfo.getString(1);
-          removeFunc.accept(file);
+          deleteFunc.accept(file);
           switch (type) {
             case CONTENT_FILE:
               dataFileCount.incrementAndGet();
Review comment: I see very little value in this constant, to be honest.
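For context, the STREAM_RESULTS and STREAM_RESULTS_DEFAULT constants introduced above back the "stream-results" action option read in doExecute(), which decides between toLocalIterator() and collectAsList(). A hedged usage sketch (the builder calls follow Iceberg's public actions API, but the metadata location and exact wiring here are illustrative only):

```java
import org.apache.iceberg.actions.DeleteReachableFiles;
import org.apache.iceberg.spark.actions.SparkActions;
import org.apache.spark.sql.SparkSession;

public class StreamResultsUsageSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().master("local[2]").appName("delete-reachable-files").getOrCreate();

    DeleteReachableFiles.Result result = SparkActions.get(spark)
        .deleteReachableFiles("s3://bucket/table/metadata/v3.metadata.json")  // hypothetical location
        .option("stream-results", "true")  // same key as STREAM_RESULTS; avoids buffering all paths on the driver
        .execute();

    System.out.println("Deleted data files: " + result.deletedDataFilesCount());
    spark.stop();
  }
}
```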