[SPARK-49857][SQL] Add storageLevel to Dataset localCheckpoint API #48324
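This PR adds a storageLevel parameter to the Dataset localCheckpoint API across the Scala and Python Dataset APIs. The main Dataset diff is collapsed on this page ("Large diffs are not rendered by default"), so here is a minimal sketch of the surface being added, with the signature inferred from the tests in this PR rather than taken from the rendered diff:

```scala
import org.apache.spark.storage.StorageLevel

// Inferred signature (an assumption based on the tests below, since the main
// Dataset diff is not rendered here):
//   def localCheckpoint(eager: Boolean, storageLevel: StorageLevel): Dataset[T]
//
// Callers can now choose how the locally checkpointed data is persisted,
// e.g. spilling it to disk instead of keeping it in memory:
val df = spark
  .range(100)
  .localCheckpoint(eager = true, storageLevel = StorageLevel.DISK_ONLY)
```

The PySpark call shape, `localCheckpoint(eager=True, storageLevel=StorageLevel.DISK_ONLY)`, appears in the Python test further down.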
Changes from all commits: 45939da, 1b9ea26, a153d9d, adde2e9, 4c38a77, 039e2be, 94890e0, 6e2247f, 07760a3, 71fd29a, 730cb77
CheckpointSuite.scala (Spark Connect Scala client tests):

@@ -27,6 +27,7 @@ import org.scalatest.exceptions.TestFailedDueToTimeoutException
 import org.apache.spark.SparkException
 import org.apache.spark.connect.proto
 import org.apache.spark.sql.test.{ConnectFunSuite, RemoteSparkSession, SQLHelper}
+import org.apache.spark.storage.StorageLevel

 class CheckpointSuite extends ConnectFunSuite with RemoteSparkSession with SQLHelper {

@@ -50,12 +51,20 @@ class CheckpointSuite extends ConnectFunSuite with RemoteSparkSession with SQLHe
     checkFragments(captureStdOut(block), fragmentsToCheck)
   }

-  test("checkpoint") {
+  test("localCheckpoint") {
     val df = spark.range(100).localCheckpoint()
     testCapturedStdOut(df.explain(), "ExistingRDD")
   }

-  test("checkpoint gc") {
+  test("localCheckpoint with StorageLevel") {
+    // We don't have a way to reach into the server and assert the storage level server side, but
+    // this test should cover for unexpected errors in the API.
Review thread on lines +60 to +61:

juliuszsompolski: @hvanhovell with the SQL API refactoring, would it now be possible to have tests that use a Connect client to self-connect, and have server-side objects (SparkContext etc.) available inside the test to verify? The existing …

hvanhovell: @juliuszsompolski it will take a few more PRs, but yeah, that is the objective.
+    val df =
+      spark.range(100).localCheckpoint(eager = true, storageLevel = StorageLevel.DISK_ONLY)
+    df.collect()
+  }
+
+  test("localCheckpoint gc") {
     val df = spark.range(100).localCheckpoint(eager = true)
     val encoder = df.agnosticEncoder
     val dfId = df.plan.getRoot.getCachedRemoteRelation.getRelationId

@@ -77,7 +86,7 @@ class CheckpointSuite extends ConnectFunSuite with RemoteSparkSession with SQLHe

   // This test is flaky because cannot guarantee GC
   // You can locally run this to verify the behavior.
-  ignore("checkpoint gc derived DataFrame") {
+  ignore("localCheckpoint gc derived DataFrame") {
     var df1 = spark.range(100).localCheckpoint(eager = true)
     var derived = df1.repartition(10)
     val encoder = df1.agnosticEncoder
Large diffs are not rendered by default.
PySpark DataFrame tests (Python):
@@ -951,11 +951,17 @@ def test_union_classmethod_usage(self):
     def test_isinstance_dataframe(self):
         self.assertIsInstance(self.spark.range(1), DataFrame)

-    def test_checkpoint_dataframe(self):
+    def test_local_checkpoint_dataframe(self):
Review note (PR author) on the renamed test: there are no tests at all for reliable checkpoint in the PySpark API. I renamed this test accordingly.
         with io.StringIO() as buf, redirect_stdout(buf):
             self.spark.range(1).localCheckpoint().explain()
             self.assertIn("ExistingRDD", buf.getvalue())

+    def test_local_checkpoint_dataframe_with_storage_level(self):
+        # We don't have a way to reach into the server and assert the storage level server side, but
+        # this test should cover for unexpected errors in the API.
+        df = self.spark.range(10).localCheckpoint(eager=True, storageLevel=StorageLevel.DISK_ONLY)
+        df.collect()
+
     def test_transpose(self):
         df = self.spark.createDataFrame([{"a": "x", "b": "y", "c": "z"}])
Review note (PR author): there are no tests that test Connect reliable checkpoint. I renamed this test accordingly.
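Since both notes point out the missing reliable-checkpoint coverage, here is a hypothetical sketch of what such a test could look like in the Connect Scala suite. This is an assumption, not part of this PR; reliable checkpoints are written to stable storage, so the server would also need a checkpoint directory configured (e.g. via spark.checkpoint.dir):

```scala
// Hypothetical test mirroring the localCheckpoint tests above; not part of this PR.
test("checkpoint") {
  val df = spark.range(100).checkpoint(eager = true)
  // Like a local checkpoint, a reliable checkpoint replaces the logical plan with
  // a scan over the materialized data, which explain() surfaces as "ExistingRDD".
  testCapturedStdOut(df.explain(), "ExistingRDD")
}
```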