Spark: fix regression from scan refactor #5143
**SparkBatch.java**

```diff
@@ -34,26 +34,24 @@
 import org.apache.iceberg.util.ThreadPools;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.broadcast.Broadcast;
+import org.apache.spark.sql.SparkSession;
 import org.apache.spark.sql.connector.read.Batch;
 import org.apache.spark.sql.connector.read.InputPartition;
 import org.apache.spark.sql.connector.read.PartitionReaderFactory;
 
-class SparkBatch implements Batch {
+abstract class SparkBatch implements Batch {
 
   private final JavaSparkContext sparkContext;
   private final Table table;
   private final SparkReadConf readConf;
-  private final List<CombinedScanTask> tasks;
   private final Schema expectedSchema;
   private final boolean caseSensitive;
   private final boolean localityEnabled;
 
-  SparkBatch(JavaSparkContext sparkContext, Table table, SparkReadConf readConf,
-             List<CombinedScanTask> tasks, Schema expectedSchema) {
-    this.sparkContext = sparkContext;
+  SparkBatch(SparkSession spark, Table table, SparkReadConf readConf, Schema expectedSchema) {
+    this.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext());
     this.table = table;
     this.readConf = readConf;
-    this.tasks = tasks;
     this.expectedSchema = expectedSchema;
     this.caseSensitive = readConf.caseSensitive();
     this.localityEnabled = readConf.localityEnabled();
@@ -65,18 +63,24 @@ public InputPartition[] planInputPartitions() {
     Broadcast<Table> tableBroadcast = sparkContext.broadcast(SerializableTable.copyOf(table));
     String expectedSchemaString = SchemaParser.toJson(expectedSchema);
 
-    InputPartition[] readTasks = new InputPartition[tasks.size()];
+    InputPartition[] readTasks = new InputPartition[tasks().size()];
 
     Tasks.range(readTasks.length)
         .stopOnFailure()
         .executeWith(localityEnabled ? ThreadPools.getWorkerPool() : null)
        .run(index -> readTasks[index] = new ReadTask(
-            tasks.get(index), tableBroadcast, expectedSchemaString,
+            tasks().get(index), tableBroadcast, expectedSchemaString,
```
**Contributor:** [minor] `tasks()` is synchronized, and we might be accessing it here with a pool; we should remove `synchronized` from the function definition and make it like: … I think now we might also want to make …

**Author (Contributor):** It looks like we weren't synchronizing before in 0.13, but I can add that check if you think it's needed.

**Contributor:** I see. As per my understanding, I was thinking we would require it, since there is a possibility of calling `tasks()` from multiple threads (as we are in a thread pool). Though I see now that on L66 above we would already have called `tasks()`, which would have populated this. All in all, I don't have a strong reason for it; it's just something that caught my eye :)

**Author (Contributor):** It looked like there are other cases where concurrent access would cause issues, so I decided just to revert it back to the way it was in 0.13 in principle. For example, in …
```diff
             caseSensitive, localityEnabled));
 
     return readTasks;
   }
 
+  protected abstract List<CombinedScanTask> tasks();
+
+  protected JavaSparkContext sparkContext() {
+    return sparkContext;
+  }
+
   @Override
   public PartitionReaderFactory createReaderFactory() {
     return new ReaderFactory(batchSize());
@@ -93,7 +97,7 @@ private int batchSize() {
   }
 
   private boolean parquetOnly() {
-    return tasks.stream().allMatch(task -> !task.isDataTask() && onlyFileFormat(task, FileFormat.PARQUET));
+    return tasks().stream().allMatch(task -> !task.isDataTask() && onlyFileFormat(task, FileFormat.PARQUET));
   }
 
   private boolean parquetBatchReadsEnabled() {
@@ -103,12 +107,12 @@ private boolean parquetBatchReadsEnabled() {
   }
 
   private boolean orcOnly() {
-    return tasks.stream().allMatch(task -> !task.isDataTask() && onlyFileFormat(task, FileFormat.ORC));
+    return tasks().stream().allMatch(task -> !task.isDataTask() && onlyFileFormat(task, FileFormat.ORC));
   }
 
   private boolean orcBatchReadsEnabled() {
     return readConf.orcVectorizationEnabled() && // vectorization enabled
-        tasks.stream().noneMatch(TableScanUtil::hasDeletes); // no delete files
+        tasks().stream().noneMatch(TableScanUtil::hasDeletes); // no delete files
   }
 
   private boolean onlyFileFormat(CombinedScanTask task, FileFormat fileFormat) {
```
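The review thread above turns on how `tasks()` is implemented in the scan subclasses: the planned tasks are memoized, so repeated calls do not re-plan the scan. A minimal sketch of that lazy memoization pattern follows; the class name `LazyTaskScan` and the `planTasks()` helper are illustrative assumptions, not the actual Iceberg code.

```java
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.List;
import org.apache.iceberg.CombinedScanTask;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;

// Hypothetical subclass sketch (not the actual Iceberg code): tasks are planned
// once and cached, so repeated tasks() calls return the same list.
abstract class LazyTaskScan {

  private List<CombinedScanTask> tasks = null; // populated on first call

  // Without `synchronized`, this is only safe if the first call completes before
  // any concurrent access; as noted in the thread above, planInputPartitions()
  // reads tasks().size() before handing work to the worker pool.
  protected List<CombinedScanTask> tasks() {
    if (tasks == null) {
      try (CloseableIterable<CombinedScanTask> taskIterable = planTasks()) {
        this.tasks = Lists.newArrayList(taskIterable);
      } catch (IOException e) {
        throw new UncheckedIOException("Failed to close task iterable", e);
      }
    }
    return tasks;
  }

  // Each concrete scan supplies its own planning logic.
  protected abstract CloseableIterable<CombinedScanTask> planTasks();
}
```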
**SparkScan.java**

```diff
@@ -40,7 +40,6 @@
 import org.apache.iceberg.spark.SparkSchemaUtil;
 import org.apache.iceberg.spark.SparkUtil;
 import org.apache.iceberg.util.PropertyUtil;
-import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.broadcast.Broadcast;
 import org.apache.spark.sql.SparkSession;
 import org.apache.spark.sql.catalyst.InternalRow;
@@ -57,10 +56,9 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-abstract class SparkScan implements Scan, SupportsReportStatistics {
+abstract class SparkScan extends SparkBatch implements Scan, SupportsReportStatistics {
   private static final Logger LOG = LoggerFactory.getLogger(SparkScan.class);
 
-  private final JavaSparkContext sparkContext;
   private final Table table;
   private final SparkReadConf readConf;
   private final boolean caseSensitive;
@@ -69,14 +67,14 @@ abstract class SparkScan implements Scan, SupportsReportStatistics {
   private final boolean readTimestampWithoutZone;
 
   // lazy variables
-  private StructType readSchema = null;
+  private StructType readSchema;
 
   SparkScan(SparkSession spark, Table table, SparkReadConf readConf,
             Schema expectedSchema, List<Expression> filters) {
+    super(spark, table, readConf, expectedSchema);
+
     SparkSchemaUtil.validateMetadataColumnReferences(table.schema(), expectedSchema);
 
-    this.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext());
     this.table = table;
     this.readConf = readConf;
     this.caseSensitive = readConf.caseSensitive();
@@ -101,16 +99,14 @@ protected List<Expression> filterExpressions() {
     return filterExpressions;
   }
 
-  protected abstract List<CombinedScanTask> tasks();
-
   @Override
   public Batch toBatch() {
-    return new SparkBatch(sparkContext, table, readConf, tasks(), expectedSchema);
+    return this;
```
**Contributor:** Nice! This is in sync with the built-in v2 file sources: https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScan.scala#L197

**Author (Contributor):** My suspicion is that there is some check that the scan implements `Batch`, or some other equality check someplace, that is required for filter pushdown.
```diff
   }
 
   @Override
   public MicroBatchStream toMicroBatchStream(String checkpointLocation) {
-    return new SparkMicroBatchStream(sparkContext, table, readConf, expectedSchema, checkpointLocation);
+    return new SparkMicroBatchStream(sparkContext(), table, readConf, expectedSchema, checkpointLocation);
   }
 
   @Override
```
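For reference, the pattern the linked `FileScan` line and the `toBatch()` change above rely on is a scan that is its own batch. A minimal sketch under that assumption follows; `SelfBatchingScan` is an illustrative name, not an actual Spark or Iceberg class.

```java
import org.apache.spark.sql.connector.read.Batch;
import org.apache.spark.sql.connector.read.Scan;

// Hypothetical sketch: the scan implements Batch directly, so toBatch() returns
// the scan itself. The Batch handed back to Spark is then the same object as the
// Scan that was planned, which keeps any identity or equality checks on the
// returned Batch pointing at the planned scan.
abstract class SelfBatchingScan implements Scan, Batch {

  @Override
  public Batch toBatch() {
    return this; // the Scan and the Batch are the same object
  }

  // planInputPartitions() and createReaderFactory() (from Batch) and
  // readSchema() (from Scan) are left for concrete subclasses to implement.
}
```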
```diff
@@ -1147,7 +1147,7 @@ public void testUseLocalIterator() {
     checkExpirationResults(1L, 0L, 0L, 1L, 2L, results);
 
     Assert.assertEquals("Expected total number of jobs with stream-results should match the expected number",
-        5L, jobsRunDuringStreamResults);
+        4L, jobsRunDuringStreamResults);
```
|
**Author (Contributor):** Spark is planning the scan more optimally by reusing `BatchScanExec`s, so there are only 4 tasks. However, if anyone has more in-depth knowledge of this assertion, it would be good to double check that this is correct.

Could you explain what the 4 tasks are? I'm really confused, thanks a lot.
```diff
     });
   }
 }
```
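For context on what the assertion above counts, here is a rough sketch of counting the Spark jobs triggered while results are streamed back with `toLocalIterator()`. The helper name and structure are illustrative guesses, not the actual test code, and listener delivery is asynchronous, so a real test would wait for the listener bus to drain before asserting.

```java
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.spark.scheduler.SparkListener;
import org.apache.spark.scheduler.SparkListenerJobStart;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

// Hypothetical helper: count how many Spark jobs start while a Dataset is
// consumed through toLocalIterator(). Each partition fetched by the local
// iterator can trigger its own job, which is roughly what the test observes.
final class StreamResultJobCounter {

  static long countJobs(SparkSession spark, Dataset<Row> df) {
    AtomicInteger jobs = new AtomicInteger();
    SparkListener listener = new SparkListener() {
      @Override
      public void onJobStart(SparkListenerJobStart jobStart) {
        jobs.incrementAndGet();
      }
    };

    spark.sparkContext().addSparkListener(listener);
    try {
      // Stream partitions back one at a time, discarding the rows.
      df.toLocalIterator().forEachRemaining(row -> { });
    } finally {
      spark.sparkContext().removeSparkListener(listener);
    }
    return jobs.get();
  }

  private StreamResultJobCounter() {
  }
}
```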
Question: is there any concern with calling `tasks()` repeatedly like this?

Yes, in some `Batch` implementations, like `fileBatch`, tasks are recomputed.

The subclasses all lazily instantiate tasks only once, AFAIK.