Skip to content

Commit

Permalink
Spark 3.4: Support distributed planning
Browse files Browse the repository at this point in the history
  • Loading branch information
aokolnychyi committed Aug 29, 2023
1 parent eb0a535 commit 530ddf1
Show file tree
Hide file tree
Showing 18 changed files with 893 additions and 44 deletions.
18 changes: 16 additions & 2 deletions core/src/main/java/org/apache/iceberg/BaseScan.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import org.apache.iceberg.expressions.Binder;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.metrics.MetricsReporter;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
Expand Down Expand Up @@ -58,7 +59,7 @@ abstract class BaseScan<ThisT, T extends ScanTask, G extends ScanTaskGroup<T>>
"upper_bounds",
"column_sizes");

private static final List<String> SCAN_WITH_STATS_COLUMNS =
protected static final List<String> SCAN_WITH_STATS_COLUMNS =
ImmutableList.<String>builder().addAll(SCAN_COLUMNS).addAll(STATS_COLUMNS).build();

protected static final List<String> DELETE_SCAN_COLUMNS =
Expand All @@ -73,7 +74,8 @@ abstract class BaseScan<ThisT, T extends ScanTask, G extends ScanTaskGroup<T>>
"record_count",
"partition",
"key_metadata",
"split_offsets");
"split_offsets",
"equality_ids");

protected static final List<String> DELETE_SCAN_WITH_STATS_COLUMNS =
ImmutableList.<String>builder().addAll(DELETE_SCAN_COLUMNS).addAll(STATS_COLUMNS).build();
Expand All @@ -95,6 +97,10 @@ public Table table() {
return table;
}

public FileIO io() {
return table.io();
}

protected Schema tableSchema() {
return schema;
}
Expand All @@ -111,10 +117,18 @@ protected List<String> scanColumns() {
return context.returnColumnStats() ? SCAN_WITH_STATS_COLUMNS : SCAN_COLUMNS;
}

protected boolean shouldReturnColumnStats() {
return context().returnColumnStats();
}

protected boolean shouldIgnoreResiduals() {
return context().ignoreResiduals();
}

protected Expression residualFilter() {
return shouldIgnoreResiduals() ? Expressions.alwaysTrue() : filter();
}

protected boolean shouldPlanWithExecutor() {
return PLAN_SCANS_WITH_WORKER_POOL || context().planWithCustomizedExecutor();
}
Expand Down
70 changes: 70 additions & 0 deletions core/src/main/java/org/apache/iceberg/DataScan.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg;

import java.util.List;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;

abstract class DataScan<ThisT, T extends ScanTask, G extends ScanTaskGroup<T>>
extends SnapshotScan<ThisT, T, G> {

protected DataScan(Table table, Schema schema, TableScanContext context) {
super(table, schema, context);
}

@Override
protected boolean useSnapshotSchema() {
return true;
}

protected ManifestGroup newManifestGroup(
List<ManifestFile> dataManifests, List<ManifestFile> deleteManifests) {
return newManifestGroup(dataManifests, deleteManifests, context().returnColumnStats());
}

protected ManifestGroup newManifestGroup(
List<ManifestFile> dataManifests, boolean withColumnStats) {
return newManifestGroup(dataManifests, ImmutableList.of(), withColumnStats);
}

protected ManifestGroup newManifestGroup(
List<ManifestFile> dataManifests,
List<ManifestFile> deleteManifests,
boolean withColumnStats) {

ManifestGroup manifestGroup =
new ManifestGroup(io(), dataManifests, deleteManifests)
.caseSensitive(isCaseSensitive())
.select(withColumnStats ? SCAN_WITH_STATS_COLUMNS : SCAN_COLUMNS)
.filterData(filter())
.specsById(table().specs())
.scanMetrics(scanMetrics())
.ignoreDeleted();

if (shouldIgnoreResiduals()) {
manifestGroup = manifestGroup.ignoreResiduals();
}

if (shouldPlanWithExecutor() && (dataManifests.size() > 1 || deleteManifests.size() > 1)) {
manifestGroup = manifestGroup.planWith(planExecutor());
}

return manifestGroup;
}
}
9 changes: 2 additions & 7 deletions core/src/main/java/org/apache/iceberg/DataTableScan.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.util.SnapshotUtil;

public class DataTableScan extends BaseTableScan {
protected DataTableScan(Table table, Schema schema, TableScanContext context) {
Expand Down Expand Up @@ -52,12 +51,8 @@ public TableScan appendsAfter(long fromSnapshotId) {
}

@Override
public TableScan useSnapshot(long scanSnapshotId) {
// call method in superclass just for the side effect of argument validation;
// we do not use its return value
super.useSnapshot(scanSnapshotId);
Schema snapshotSchema = SnapshotUtil.schemaFor(table(), scanSnapshotId);
return newRefinedScan(table(), snapshotSchema, context().useSnapshotId(scanSnapshotId));
protected boolean useSnapshotSchema() {
return true;
}

@Override
Expand Down
4 changes: 4 additions & 0 deletions core/src/main/java/org/apache/iceberg/DeleteFileIndex.java
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,10 @@ DeleteFile[] forEntry(ManifestEntry<DataFile> entry) {
return forDataFile(entry.dataSequenceNumber(), entry.file());
}

DeleteFile[] forDataFile(DataFile file) {
return forDataFile(file.dataSequenceNumber(), file);
}

DeleteFile[] forDataFile(long sequenceNumber, DataFile file) {
if (isEmpty) {
return NO_DELETES;
Expand Down

0 comments on commit 530ddf1

Please sign in to comment.