-
Notifications
You must be signed in to change notification settings - Fork 2.9k
Combine tasks to scan up to target split size using parquet row group information #204
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,8 +22,10 @@ | |
| import com.google.common.annotations.VisibleForTesting; | ||
| import com.google.common.base.Objects; | ||
| import com.google.common.collect.ImmutableList; | ||
| import java.util.ArrayList; | ||
| import java.util.Iterator; | ||
| import java.util.List; | ||
| import java.util.NoSuchElementException; | ||
| import org.apache.iceberg.expressions.Expression; | ||
| import org.apache.iceberg.expressions.ResidualEvaluator; | ||
|
|
||
|
|
@@ -71,12 +73,12 @@ public Expression residual() { | |
| } | ||
|
|
||
| @Override | ||
| public Iterable<FileScanTask> split(long splitSize) { | ||
| public Iterable<FileScanTask> split(long targetSplitSize) { | ||
| if (file.format().isSplittable()) { | ||
| if (file.splitOffsets() != null) { | ||
| return () -> new OffsetsBasedSplitScanTaskIterator(file.splitOffsets(), this); | ||
| return () -> new OffsetsAwareTargetSplitSizeScanTaskIterator(file.splitOffsets(), this, targetSplitSize); | ||
| } else { | ||
| return () -> new FixedSizeSplitScanTaskIterator(splitSize, this); | ||
| return () -> new FixedSizeSplitScanTaskIterator(targetSplitSize, this); | ||
| } | ||
| } | ||
| return ImmutableList.of(this); | ||
|
|
@@ -91,29 +93,51 @@ public String toString() { | |
| .toString(); | ||
| } | ||
|
|
||
| /** | ||
| * This iterator returns {@link FileScanTask} using guidance provided by split offsets. | ||
| */ | ||
| @VisibleForTesting | ||
| static final class OffsetsBasedSplitScanTaskIterator implements Iterator<FileScanTask> { | ||
| private final List<Long> splitOffsets; | ||
| static final class OffsetsAwareTargetSplitSizeScanTaskIterator implements Iterator<FileScanTask> { | ||
| private final List<Long> offsets; | ||
| private final List<Long> splitSizes; | ||
| private final FileScanTask parentScanTask; | ||
| private int idx = 0; | ||
|
|
||
| OffsetsBasedSplitScanTaskIterator(List<Long> splitOffsets, FileScanTask fileScanTask) { | ||
| this.splitOffsets = splitOffsets; | ||
| this.parentScanTask = fileScanTask; | ||
| private final long targetSplitSize; | ||
| private int sizeIdx = 0; | ||
|
|
||
| OffsetsAwareTargetSplitSizeScanTaskIterator( | ||
| List<Long> offsetList, FileScanTask parentScanTask, long targetSplitSize) { | ||
| this.offsets = ImmutableList.copyOf(offsetList); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Missed this the first time: why copy the offset list? It shouldn't change.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just being a bit defensive. Future implementations of the DataFile interface may not create an immutable copy before passing it downstream.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems fine since this should be an ImmutableList in the current read path and won't actually copy. |
||
| this.parentScanTask = parentScanTask; | ||
| this.targetSplitSize = targetSplitSize; | ||
| this.splitSizes = new ArrayList<>(offsets.size()); | ||
| int lastIndex = offsets.size() - 1; | ||
| for (int index = 0; index < lastIndex; index++) { | ||
| splitSizes.add(offsets.get(index + 1) - offsets.get(index)); | ||
| } | ||
| splitSizes.add(parentScanTask.length() - offsets.get(lastIndex)); | ||
| } | ||
|
|
||
| @Override | ||
| public boolean hasNext() { | ||
| return idx < splitOffsets.size(); | ||
| return sizeIdx < splitSizes.size(); | ||
| } | ||
|
|
||
| @Override | ||
| public FileScanTask next() { | ||
| long start = splitOffsets.get(idx); | ||
| idx++; | ||
| long end = hasNext() ? splitOffsets.get(idx) : parentScanTask.length(); | ||
| return new SplitScanTask(start, end - start, parentScanTask); | ||
| if (!hasNext()) { | ||
| throw new NoSuchElementException(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 for correctly implementing the contract! |
||
| } | ||
| int offsetIdx = sizeIdx; | ||
| long currentSize = splitSizes.get(sizeIdx); | ||
| sizeIdx += 1; // always consume at least one file split | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is |
||
| while (sizeIdx < splitSizes.size() && currentSize + splitSizes.get(sizeIdx) <= targetSplitSize) { | ||
| currentSize += splitSizes.get(sizeIdx); | ||
| sizeIdx += 1; | ||
| } | ||
| FileScanTask combinedTask = new SplitScanTask(offsets.get(offsetIdx), currentSize, parentScanTask); | ||
| return combinedTask; | ||
| } | ||
|
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: another blank line |
||
| } | ||
|
|
||
| @VisibleForTesting | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: style should be: