Spark: Backport #6480 to Spark 3.2 and Spark 3.1 (#7425)
amogh-jahagirdar committed Apr 25, 2023
1 parent: ce51d5f · commit: f917e28
Showing 4 changed files with 102 additions and 0 deletions.
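The change is easier to follow with the failure scenario in mind: an Iceberg structured-streaming read checkpoints a StreamingOffset that carries a snapshot id, and if that snapshot is expired before the query is restarted from its checkpoint, the source can no longer resolve the offset. The diffs below add an explicit check for that case in planFiles, plus a test that exercises it, in both the Spark 3.2 and Spark 3.1 modules. As a rough standalone illustration, here is a minimal sketch of the sequence, assuming a local Spark session with the Iceberg runtime on the classpath, a Hadoop catalog named local with warehouse /tmp/warehouse, and an existing table db.events with an (id INT, data STRING) schema that already holds at least one snapshot; none of these names or paths come from the commit itself:

// Sketch only: reproduces the situation the new check reports explicitly.
// Catalog name, table name, schema, and paths are placeholders.
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Table;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.hadoop.HadoopCatalog;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.StreamingQuery;

public class ExpiredSnapshotStreamRestart {
  public static void main(String[] args) throws Exception {
    SparkSession spark =
        SparkSession.builder()
            .master("local[2]")
            .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
            .config("spark.sql.catalog.local.type", "hadoop")
            .config("spark.sql.catalog.local.warehouse", "/tmp/warehouse")
            .getOrCreate();

    // 1. Run a streaming read; the checkpoint records the snapshot id of the last processed offset.
    StreamingQuery query =
        spark
            .readStream()
            .format("iceberg")
            .load("local.db.events")
            .writeStream()
            .format("parquet")
            .option("checkpointLocation", "/tmp/events-checkpoint")
            .option("path", "/tmp/events-out")
            .start();
    query.processAllAvailable();
    query.stop();

    // 2. More data is committed after the query stops, so the checkpointed snapshot
    //    is no longer the current one.
    Table table =
        new HadoopCatalog(new Configuration(), "/tmp/warehouse")
            .loadTable(TableIdentifier.of("db", "events"));
    long checkpointedSnapshotId = table.currentSnapshot().snapshotId();
    spark.sql("INSERT INTO local.db.events VALUES (2, 'two')");

    // 3. Routine maintenance expires that older snapshot.
    table.refresh();
    table.expireSnapshots().expireSnapshotId(checkpointedSnapshotId).commit();

    // 4. Restarting the same writer from the same checkpoint should now fail fast with
    //    "Cannot load current offset at snapshot <id>, the snapshot was expired or removed".
  }
}

Previously the unresolved (null) snapshot was passed straight into shouldProcess; the new guard fails immediately with a message that names the missing snapshot id.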
@@ -200,6 +200,15 @@ private List<FileScanTask> planFiles(StreamingOffset startOffset, StreamingOffse
currentOffset = new StreamingOffset(snapshotAfter.snapshotId(), 0L, false);
}

Snapshot snapshot = table.snapshot(currentOffset.snapshotId());

if (snapshot == null) {
throw new IllegalStateException(
String.format(
"Cannot load current offset at snapshot %d, the snapshot was expired or removed",
currentOffset.snapshotId()));
}

if (!shouldProcess(table.snapshot(currentOffset.snapshotId()))) {
LOG.debug("Skipping snapshot: {} of table {}", currentOffset.snapshotId(), table.name());
continue;
@@ -19,8 +19,10 @@
package org.apache.iceberg.spark.source;

import static org.apache.iceberg.expressions.Expressions.ref;
import static org.assertj.core.api.AssertionsForClassTypes.assertThatThrownBy;

import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Map;
@@ -323,6 +325,46 @@ public void testResumingStreamReadFromCheckpoint() throws Exception {
}
}

@Test
public void testFailReadingCheckpointInvalidSnapshot() throws IOException, TimeoutException {
File writerCheckpointFolder = temp.newFolder("writer-checkpoint-folder");
File writerCheckpoint = new File(writerCheckpointFolder, "writer-checkpoint");
File output = temp.newFolder();

DataStreamWriter querySource =
spark
.readStream()
.format("iceberg")
.load(tableName)
.writeStream()
.option("checkpointLocation", writerCheckpoint.toString())
.format("parquet")
.queryName("checkpoint_test")
.option("path", output.getPath());

List<SimpleRecord> firstSnapshotRecordList = Lists.newArrayList(new SimpleRecord(1, "one"));
List<SimpleRecord> secondSnapshotRecordList = Lists.newArrayList(new SimpleRecord(2, "two"));
StreamingQuery startQuery = querySource.start();

appendData(firstSnapshotRecordList);
table.refresh();
long firstSnapshotid = table.currentSnapshot().snapshotId();
startQuery.processAllAvailable();
startQuery.stop();

appendData(secondSnapshotRecordList);

table.expireSnapshots().expireSnapshotId(firstSnapshotid).commit();

StreamingQuery restartedQuery = querySource.start();
assertThatThrownBy(restartedQuery::processAllAvailable)
.hasCauseInstanceOf(IllegalStateException.class)
.hasMessageContaining(
String.format(
"Cannot load current offset at snapshot %d, the snapshot was expired or removed",
firstSnapshotid));
}

@Test
public void testParquetOrcAvroDataInOneTable() throws Exception {
List<SimpleRecord> parquetFileRecords =
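The test above expires the snapshot directly through the table API (expireSnapshots().expireSnapshotId(...)). In practice the same state usually arises from routine snapshot-expiration maintenance; the following is a sketch of such a maintenance job using Iceberg's Spark actions API, with placeholder catalog, table, and retention values (the job itself is not part of this commit):

// Illustrative maintenance job; not part of this commit. Placeholder names and values.
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Table;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.hadoop.HadoopCatalog;
import org.apache.iceberg.spark.actions.SparkActions;
import org.apache.spark.sql.SparkSession;

public class ExpireSnapshotsMaintenance {
  public static void main(String[] args) {
    // SparkActions uses the active Spark session.
    SparkSession.builder().master("local[2]").getOrCreate();

    Table table =
        new HadoopCatalog(new Configuration(), "/tmp/warehouse")
            .loadTable(TableIdentifier.of("db", "events"));

    // Expire snapshots older than one day, keeping at least the last 5.
    long olderThanMillis = System.currentTimeMillis() - 24L * 60 * 60 * 1000L;
    SparkActions.get()
        .expireSnapshots(table)
        .expireOlderThan(olderThanMillis)
        .retainLast(5)
        .execute();
  }
}

Any streaming query whose checkpoint still points at one of the expired snapshots will, with this change, fail on restart with the IllegalStateException added in the diffs above and below.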
@@ -208,6 +208,15 @@ private List<FileScanTask> planFiles(StreamingOffset startOffset, StreamingOffse
currentOffset = new StreamingOffset(snapshotAfter.snapshotId(), 0L, false);
}

Snapshot snapshot = table.snapshot(currentOffset.snapshotId());

if (snapshot == null) {
throw new IllegalStateException(
String.format(
"Cannot load current offset at snapshot %d, the snapshot was expired or removed",
currentOffset.snapshotId()));
}

if (!shouldProcess(table.snapshot(currentOffset.snapshotId()))) {
LOG.debug("Skipping snapshot: {} of table {}", currentOffset.snapshotId(), table.name());
continue;
@@ -19,8 +19,10 @@
package org.apache.iceberg.spark.source;

import static org.apache.iceberg.expressions.Expressions.ref;
import static org.assertj.core.api.AssertionsForClassTypes.assertThatThrownBy;

import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Map;
@@ -323,6 +325,46 @@ public void testResumingStreamReadFromCheckpoint() throws Exception {
}
}

@Test
public void testFailReadingCheckpointInvalidSnapshot() throws IOException, TimeoutException {
File writerCheckpointFolder = temp.newFolder("writer-checkpoint-folder");
File writerCheckpoint = new File(writerCheckpointFolder, "writer-checkpoint");
File output = temp.newFolder();

DataStreamWriter querySource =
spark
.readStream()
.format("iceberg")
.load(tableName)
.writeStream()
.option("checkpointLocation", writerCheckpoint.toString())
.format("parquet")
.queryName("checkpoint_test")
.option("path", output.getPath());

List<SimpleRecord> firstSnapshotRecordList = Lists.newArrayList(new SimpleRecord(1, "one"));
List<SimpleRecord> secondSnapshotRecordList = Lists.newArrayList(new SimpleRecord(2, "two"));
StreamingQuery startQuery = querySource.start();

appendData(firstSnapshotRecordList);
table.refresh();
long firstSnapshotid = table.currentSnapshot().snapshotId();
startQuery.processAllAvailable();
startQuery.stop();

appendData(secondSnapshotRecordList);

table.expireSnapshots().expireSnapshotId(firstSnapshotid).commit();

StreamingQuery restartedQuery = querySource.start();
assertThatThrownBy(restartedQuery::processAllAvailable)
.hasCauseInstanceOf(IllegalStateException.class)
.hasMessageContaining(
String.format(
"Cannot load current offset at snapshot %d, the snapshot was expired or removed",
firstSnapshotid));
}

@Test
public void testParquetOrcAvroDataInOneTable() throws Exception {
List<SimpleRecord> parquetFileRecords =