Spark 3.4: Support distributed planning

apache · Aug 29, 2023 · 530ddf1 · 530ddf1
1 parent eb0a535
commit 530ddf1
Show file tree

Hide file tree

Showing 18 changed files with 893 additions and 44 deletions.
diff --git a/core/src/main/java/org/apache/iceberg/BaseScan.java b/core/src/main/java/org/apache/iceberg/BaseScan.java
@@ -27,6 +27,7 @@
 import org.apache.iceberg.expressions.Binder;
 import org.apache.iceberg.expressions.Expression;
 import org.apache.iceberg.expressions.Expressions;
+import org.apache.iceberg.io.FileIO;
 import org.apache.iceberg.metrics.MetricsReporter;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
 import org.apache.iceberg.relocated.com.google.common.collect.Sets;
@@ -58,7 +59,7 @@ abstract class BaseScan<ThisT, T extends ScanTask, G extends ScanTaskGroup<T>>
           "upper_bounds",
           "column_sizes");
 
-  private static final List<String> SCAN_WITH_STATS_COLUMNS =
+  protected static final List<String> SCAN_WITH_STATS_COLUMNS =
       ImmutableList.<String>builder().addAll(SCAN_COLUMNS).addAll(STATS_COLUMNS).build();
 
   protected static final List<String> DELETE_SCAN_COLUMNS =
@@ -73,7 +74,8 @@ abstract class BaseScan<ThisT, T extends ScanTask, G extends ScanTaskGroup<T>>
           "record_count",
           "partition",
           "key_metadata",
-          "split_offsets");
+          "split_offsets",
+          "equality_ids");
 
   protected static final List<String> DELETE_SCAN_WITH_STATS_COLUMNS =
       ImmutableList.<String>builder().addAll(DELETE_SCAN_COLUMNS).addAll(STATS_COLUMNS).build();
@@ -95,6 +97,10 @@ public Table table() {
     return table;
   }
 
+  public FileIO io() {
+    return table.io();
+  }
+
   protected Schema tableSchema() {
     return schema;
   }
@@ -111,10 +117,18 @@ protected List<String> scanColumns() {
     return context.returnColumnStats() ? SCAN_WITH_STATS_COLUMNS : SCAN_COLUMNS;
   }
 
+  protected boolean shouldReturnColumnStats() {
+    return context().returnColumnStats();
+  }
+
   protected boolean shouldIgnoreResiduals() {
     return context().ignoreResiduals();
   }
 
+  protected Expression residualFilter() {
+    return shouldIgnoreResiduals() ? Expressions.alwaysTrue() : filter();
+  }
+
   protected boolean shouldPlanWithExecutor() {
     return PLAN_SCANS_WITH_WORKER_POOL || context().planWithCustomizedExecutor();
   }

diff --git a/core/src/main/java/org/apache/iceberg/DataScan.java b/core/src/main/java/org/apache/iceberg/DataScan.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg;
+
+import java.util.List;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
+
+abstract class DataScan<ThisT, T extends ScanTask, G extends ScanTaskGroup<T>>
+    extends SnapshotScan<ThisT, T, G> {
+
+  protected DataScan(Table table, Schema schema, TableScanContext context) {
+    super(table, schema, context);
+  }
+
+  @Override
+  protected boolean useSnapshotSchema() {
+    return true;
+  }
+
+  protected ManifestGroup newManifestGroup(
+      List<ManifestFile> dataManifests, List<ManifestFile> deleteManifests) {
+    return newManifestGroup(dataManifests, deleteManifests, context().returnColumnStats());
+  }
+
+  protected ManifestGroup newManifestGroup(
+      List<ManifestFile> dataManifests, boolean withColumnStats) {
+    return newManifestGroup(dataManifests, ImmutableList.of(), withColumnStats);
+  }
+
+  protected ManifestGroup newManifestGroup(
+      List<ManifestFile> dataManifests,
+      List<ManifestFile> deleteManifests,
+      boolean withColumnStats) {
+
+    ManifestGroup manifestGroup =
+        new ManifestGroup(io(), dataManifests, deleteManifests)
+            .caseSensitive(isCaseSensitive())
+            .select(withColumnStats ? SCAN_WITH_STATS_COLUMNS : SCAN_COLUMNS)
+            .filterData(filter())
+            .specsById(table().specs())
+            .scanMetrics(scanMetrics())
+            .ignoreDeleted();
+
+    if (shouldIgnoreResiduals()) {
+      manifestGroup = manifestGroup.ignoreResiduals();
+    }
+
+    if (shouldPlanWithExecutor() && (dataManifests.size() > 1 || deleteManifests.size() > 1)) {
+      manifestGroup = manifestGroup.planWith(planExecutor());
+    }
+
+    return manifestGroup;
+  }
+}
diff --git a/core/src/main/java/org/apache/iceberg/DataTableScan.java b/core/src/main/java/org/apache/iceberg/DataTableScan.java
@@ -22,7 +22,6 @@
 import org.apache.iceberg.io.CloseableIterable;
 import org.apache.iceberg.io.FileIO;
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
-import org.apache.iceberg.util.SnapshotUtil;
 
 public class DataTableScan extends BaseTableScan {
   protected DataTableScan(Table table, Schema schema, TableScanContext context) {
@@ -52,12 +51,8 @@ public TableScan appendsAfter(long fromSnapshotId) {
   }
 
   @Override
-  public TableScan useSnapshot(long scanSnapshotId) {
-    // call method in superclass just for the side effect of argument validation;
-    // we do not use its return value
-    super.useSnapshot(scanSnapshotId);
-    Schema snapshotSchema = SnapshotUtil.schemaFor(table(), scanSnapshotId);
-    return newRefinedScan(table(), snapshotSchema, context().useSnapshotId(scanSnapshotId));
+  protected boolean useSnapshotSchema() {
+    return true;
   }
 
   @Override

diff --git a/core/src/main/java/org/apache/iceberg/DeleteFileIndex.java b/core/src/main/java/org/apache/iceberg/DeleteFileIndex.java
@@ -142,6 +142,10 @@ DeleteFile[] forEntry(ManifestEntry<DataFile> entry) {
     return forDataFile(entry.dataSequenceNumber(), entry.file());
   }
 
+  DeleteFile[] forDataFile(DataFile file) {
+    return forDataFile(file.dataSequenceNumber(), file);
+  }
+
   DeleteFile[] forDataFile(long sequenceNumber, DataFile file) {
     if (isEmpty) {
       return NO_DELETES;