@@ -1115,13 +1115,20 @@ class Dataset[T] private[sql] (
   }
 
   /** @inheritdoc */
-  protected def checkpoint(eager: Boolean, reliableCheckpoint: Boolean): Dataset[T] = {
+  protected def checkpoint(
+      eager: Boolean,
+      reliableCheckpoint: Boolean,
+      storageLevel: Option[StorageLevel]): Dataset[T] = {
     sparkSession.newDataset(agnosticEncoder) { builder =>
       val command = sparkSession.newCommand { builder =>
-        builder.getCheckpointCommandBuilder
+        val checkpointBuilder = builder.getCheckpointCommandBuilder
           .setLocal(!reliableCheckpoint)
           .setEager(eager)
           .setRelation(this.plan.getRoot)
+        storageLevel.foreach { storageLevel =>
+          checkpointBuilder.setStorageLevel(
+            StorageLevelProtoConverter.toConnectProtoType(storageLevel))
+        }
       }
       val responseIter = sparkSession.execute(command)
       try {
@@ -1304,6 +1311,10 @@ class Dataset[T] private[sql] (
   /** @inheritdoc */
   override def localCheckpoint(eager: Boolean): Dataset[T] = super.localCheckpoint(eager)
 
+  /** @inheritdoc */
+  override def localCheckpoint(eager: Boolean, storageLevel: StorageLevel): Dataset[T] =
+    super.localCheckpoint(eager, storageLevel)
+
   /** @inheritdoc */
   override def joinWith[U](other: Dataset[U], condition: Column): Dataset[(T, U)] =
     super.joinWith(other, condition)
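For reference, a minimal usage sketch of the new overload from a Spark Connect client. This is not part of the diff; the endpoint URL and data are illustrative assumptions.

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.storage.StorageLevel

    // Hypothetical Connect endpoint; any reachable server works.
    val spark = SparkSession.builder().remote("sc://localhost:15002").getOrCreate()

    // Truncate lineage and keep the checkpointed blocks on disk only, instead of
    // the default storage level used for RDD local checkpoints.
    val df = spark.range(1000).localCheckpoint(eager = true, storageLevel = StorageLevel.DISK_ONLY)

    df.count() // served from the materialized checkpoint, not by re-running the plan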
@@ -27,6 +27,7 @@ import org.scalatest.exceptions.TestFailedDueToTimeoutException
 import org.apache.spark.SparkException
 import org.apache.spark.connect.proto
 import org.apache.spark.sql.test.{ConnectFunSuite, RemoteSparkSession, SQLHelper}
+import org.apache.spark.storage.StorageLevel
 
 class CheckpointSuite extends ConnectFunSuite with RemoteSparkSession with SQLHelper {
 
@@ -50,12 +51,20 @@ class CheckpointSuite extends ConnectFunSuite with RemoteSparkSession with SQLHelper {
     checkFragments(captureStdOut(block), fragmentsToCheck)
   }
 
-  test("checkpoint") {
+  test("localCheckpoint") {
[Review comment — juliuszsompolski (Contributor, author), on lines -53 to +54: note: there are no tests that test Connect reliable checkpoint. I renamed this test accordingly.]

     val df = spark.range(100).localCheckpoint()
     testCapturedStdOut(df.explain(), "ExistingRDD")
   }
 
-  test("checkpoint gc") {
+  test("localCheckpoint with StorageLevel") {
+    // We don't have a way to reach into the server and assert the storage level server side, but
+    // this test should cover for unexpected errors in the API.
[Review comment — juliuszsompolski (Contributor, author), on lines +60 to +61: @hvanhovell with the SQL API refactoring, would it be now possible to have tests that use a connect client to self-connect, and have server side objects (SparkContext) etc. available inside the test to verify? The existing SparkConnectServerTest can only test internal SparkConnectClient with the server, due to past namespace conflicts between server and client SparkSession APIs.]

[Reply — hvanhovell (Contributor): @juliuszsompolski it will take a few more PRs, but yeah that is the objective.]

+    val df =
+      spark.range(100).localCheckpoint(eager = true, storageLevel = StorageLevel.DISK_ONLY)
+    df.collect()
+  }
+
+  test("localCheckpoint gc") {
     val df = spark.range(100).localCheckpoint(eager = true)
     val encoder = df.agnosticEncoder
     val dfId = df.plan.getRoot.getCachedRemoteRelation.getRelationId
@@ -77,7 +86,7 @@ class CheckpointSuite extends ConnectFunSuite with RemoteSparkSession with SQLHelper {
 
   // This test is flaky because cannot guarantee GC
   // You can locally run this to verify the behavior.
-  ignore("checkpoint gc derived DataFrame") {
+  ignore("localCheckpoint gc derived DataFrame") {
     var df1 = spark.range(100).localCheckpoint(eager = true)
     var derived = df1.repartition(10)
     val encoder = df1.agnosticEncoder
9 changes: 7 additions & 2 deletions python/pyspark/sql/classic/dataframe.py
@@ -360,8 +360,13 @@ def checkpoint(self, eager: bool = True) -> ParentDataFrame:
         jdf = self._jdf.checkpoint(eager)
         return DataFrame(jdf, self.sparkSession)
 
-    def localCheckpoint(self, eager: bool = True) -> ParentDataFrame:
-        jdf = self._jdf.localCheckpoint(eager)
+    def localCheckpoint(
+        self, eager: bool = True, storageLevel: Optional[StorageLevel] = None
+    ) -> ParentDataFrame:
+        if storageLevel is None:
+            jdf = self._jdf.localCheckpoint(eager)
+        else:
+            jdf = self._jdf.localCheckpoint(eager, self._sc._getJavaStorageLevel(storageLevel))
         return DataFrame(jdf, self.sparkSession)
 
     def withWatermark(self, eventTime: str, delayThreshold: str) -> ParentDataFrame:
6 changes: 4 additions & 2 deletions python/pyspark/sql/connect/dataframe.py
@@ -2173,8 +2173,10 @@ def checkpoint(self, eager: bool = True) -> ParentDataFrame:
         assert isinstance(checkpointed._plan, plan.CachedRemoteRelation)
         return checkpointed
 
-    def localCheckpoint(self, eager: bool = True) -> ParentDataFrame:
-        cmd = plan.Checkpoint(child=self._plan, local=True, eager=eager)
+    def localCheckpoint(
+        self, eager: bool = True, storageLevel: Optional[StorageLevel] = None
+    ) -> ParentDataFrame:
+        cmd = plan.Checkpoint(child=self._plan, local=True, eager=eager, storage_level=storageLevel)
         _, properties, self._execution_info = self._session.client.execute_command(
             cmd.command(self._session.client)
         )
22 changes: 15 additions & 7 deletions python/pyspark/sql/connect/plan.py
@@ -1868,21 +1868,29 @@ def command(self, session: "SparkConnectClient") -> proto.Command:


 class Checkpoint(LogicalPlan):
-    def __init__(self, child: Optional["LogicalPlan"], local: bool, eager: bool) -> None:
+    def __init__(
+        self,
+        child: Optional["LogicalPlan"],
+        local: bool,
+        eager: bool,
+        storage_level: Optional[StorageLevel] = None,
+    ) -> None:
         super().__init__(child)
         self._local = local
         self._eager = eager
+        self._storage_level = storage_level
 
     def command(self, session: "SparkConnectClient") -> proto.Command:
         cmd = proto.Command()
         assert self._child is not None
-        cmd.checkpoint_command.CopyFrom(
-            proto.CheckpointCommand(
-                relation=self._child.plan(session),
-                local=self._local,
-                eager=self._eager,
-            )
-        )
+        checkpoint_command = proto.CheckpointCommand(
+            relation=self._child.plan(session),
+            local=self._local,
+            eager=self._eager,
+        )
+        if self._storage_level is not None:
+            checkpoint_command.storage_level.CopyFrom(storage_level_to_proto(self._storage_level))
+        cmd.checkpoint_command.CopyFrom(checkpoint_command)
         return cmd


14 changes: 7 additions & 7 deletions python/pyspark/sql/connect/proto/commands_pb2.py

(Generated file; large diff not rendered by default.)

29 changes: 27 additions & 2 deletions python/pyspark/sql/connect/proto/commands_pb2.pyi
@@ -2188,6 +2188,7 @@ class CheckpointCommand(google.protobuf.message.Message):
     RELATION_FIELD_NUMBER: builtins.int
     LOCAL_FIELD_NUMBER: builtins.int
     EAGER_FIELD_NUMBER: builtins.int
+    STORAGE_LEVEL_FIELD_NUMBER: builtins.int
     @property
     def relation(self) -> pyspark.sql.connect.proto.relations_pb2.Relation:
         """(Required) The logical plan to checkpoint."""
@@ -2197,22 +2198,46 @@ class CheckpointCommand(google.protobuf.message.Message):
"""
eager: builtins.bool
"""(Required) Whether to checkpoint this dataframe immediately."""
@property
def storage_level(self) -> pyspark.sql.connect.proto.common_pb2.StorageLevel:
"""(Optional) For local checkpoint, the storage level to use."""
def __init__(
self,
*,
relation: pyspark.sql.connect.proto.relations_pb2.Relation | None = ...,
local: builtins.bool = ...,
eager: builtins.bool = ...,
storage_level: pyspark.sql.connect.proto.common_pb2.StorageLevel | None = ...,
) -> None: ...
def HasField(
self, field_name: typing_extensions.Literal["relation", b"relation"]
self,
field_name: typing_extensions.Literal[
"_storage_level",
b"_storage_level",
"relation",
b"relation",
"storage_level",
b"storage_level",
],
) -> builtins.bool: ...
def ClearField(
self,
field_name: typing_extensions.Literal[
"eager", b"eager", "local", b"local", "relation", b"relation"
"_storage_level",
b"_storage_level",
"eager",
b"eager",
"local",
b"local",
"relation",
b"relation",
"storage_level",
b"storage_level",
],
) -> None: ...
def WhichOneof(
self, oneof_group: typing_extensions.Literal["_storage_level", b"_storage_level"]
) -> typing_extensions.Literal["storage_level"] | None: ...

global___CheckpointCommand = CheckpointCommand

9 changes: 8 additions & 1 deletion python/pyspark/sql/dataframe.py
@@ -1015,7 +1015,9 @@ def checkpoint(self, eager: bool = True) -> "DataFrame":
"""
...

def localCheckpoint(self, eager: bool = True) -> "DataFrame":
def localCheckpoint(
self, eager: bool = True, storageLevel: Optional[StorageLevel] = None
) -> "DataFrame":
"""Returns a locally checkpointed version of this :class:`DataFrame`. Checkpointing can
be used to truncate the logical plan of this :class:`DataFrame`, which is especially
useful in iterative algorithms where the plan may grow exponentially. Local checkpoints
@@ -1026,12 +1028,17 @@ def localCheckpoint(self, eager: bool = True) -> "DataFrame":

         .. versionchanged:: 4.0.0
             Supports Spark Connect.
+            Added storageLevel parameter.
 
         Parameters
         ----------
         eager : bool, optional, default True
             Whether to checkpoint this :class:`DataFrame` immediately.
 
+        storageLevel : :class:`StorageLevel`, optional, default None
+            The StorageLevel with which the checkpoint will be stored.
+            If not specified, the default storage level for RDD local checkpoints is used.
+
         Returns
         -------
         :class:`DataFrame`
8 changes: 7 additions & 1 deletion python/pyspark/sql/tests/test_dataframe.py
@@ -951,11 +951,17 @@ def test_union_classmethod_usage(self):
     def test_isinstance_dataframe(self):
         self.assertIsInstance(self.spark.range(1), DataFrame)
 
-    def test_checkpoint_dataframe(self):
+    def test_local_checkpoint_dataframe(self):
[Review comment — juliuszsompolski (Contributor, author): note: there are no tests at all for reliable checkpoint in pyspark API. I renamed this test accordingly.]

         with io.StringIO() as buf, redirect_stdout(buf):
             self.spark.range(1).localCheckpoint().explain()
         self.assertIn("ExistingRDD", buf.getvalue())
 
+    def test_local_checkpoint_dataframe_with_storage_level(self):
+        # We don't have a way to reach into the server and assert the storage level server side, but
+        # this test should cover for unexpected errors in the API.
+        df = self.spark.range(10).localCheckpoint(eager=True, storageLevel=StorageLevel.DISK_ONLY)
+        df.collect()
 
     def test_transpose(self):
         df = self.spark.createDataFrame([{"a": "x", "b": "y", "c": "z"}])

42 changes: 36 additions & 6 deletions sql/api/src/main/scala/org/apache/spark/sql/api/Dataset.scala
@@ -312,7 +312,8 @@ abstract class Dataset[T] extends Serializable {
    * @group basic
    * @since 2.1.0
    */
-  def checkpoint(): Dataset[T] = checkpoint(eager = true, reliableCheckpoint = true)
+  def checkpoint(): Dataset[T] =
+    checkpoint(eager = true, reliableCheckpoint = true, storageLevel = None)
 
   /**
    * Returns a checkpointed version of this Dataset. Checkpointing can be used to truncate the
@@ -332,7 +333,7 @@ abstract class Dataset[T] extends Serializable {
    * @since 2.1.0
    */
   def checkpoint(eager: Boolean): Dataset[T] =
-    checkpoint(eager = eager, reliableCheckpoint = true)
+    checkpoint(eager = eager, reliableCheckpoint = true, storageLevel = None)
 
   /**
    * Eagerly locally checkpoints a Dataset and return the new Dataset. Checkpointing can be used
@@ -343,7 +344,8 @@ abstract class Dataset[T] extends Serializable {
    * @group basic
    * @since 2.3.0
    */
-  def localCheckpoint(): Dataset[T] = checkpoint(eager = true, reliableCheckpoint = false)
+  def localCheckpoint(): Dataset[T] =
+    checkpoint(eager = true, reliableCheckpoint = false, storageLevel = None)
 
   /**
    * Locally checkpoints a Dataset and return the new Dataset. Checkpointing can be used to
@@ -363,7 +365,29 @@ abstract class Dataset[T] extends Serializable {
    * @since 2.3.0
    */
   def localCheckpoint(eager: Boolean): Dataset[T] =
-    checkpoint(eager = eager, reliableCheckpoint = false)
+    checkpoint(eager = eager, reliableCheckpoint = false, storageLevel = None)
+
+  /**
+   * Locally checkpoints a Dataset and return the new Dataset. Checkpointing can be used to
+   * truncate the logical plan of this Dataset, which is especially useful in iterative algorithms
+   * where the plan may grow exponentially. Local checkpoints are written to executor storage and
+   * despite potentially faster they are unreliable and may compromise job completion.
+   *
+   * @param eager
+   *   Whether to checkpoint this dataframe immediately
+   * @param storageLevel
+   *   StorageLevel with which to checkpoint the data.
+   * @note
+   *   When checkpoint is used with eager = false, the final data that is checkpointed after the
+   *   first action may be different from the data that was used during the job due to
+   *   non-determinism of the underlying operation and retries. If checkpoint is used to achieve
+   *   saving a deterministic snapshot of the data, eager = true should be used. Otherwise, it is
+   *   only deterministic after the first execution, after the checkpoint was finalized.
+   * @group basic
+   * @since 4.0.0
+   */
+  def localCheckpoint(eager: Boolean, storageLevel: StorageLevel): Dataset[T] =
+    checkpoint(eager = eager, reliableCheckpoint = false, storageLevel = Some(storageLevel))

/**
* Returns a checkpointed version of this Dataset.
@@ -373,8 +397,14 @@ abstract class Dataset[T] extends Serializable {
    * @param reliableCheckpoint
    *   Whether to create a reliable checkpoint saved to files inside the checkpoint directory. If
    *   false creates a local checkpoint using the caching subsystem
-   */
-  protected def checkpoint(eager: Boolean, reliableCheckpoint: Boolean): Dataset[T]
+   * @param storageLevel
+   *   Option. If defined, the StorageLevel with which to checkpoint the data. Only valid with
+   *   reliableCheckpoint = false.
+   */
+  protected def checkpoint(
+      eager: Boolean,
+      reliableCheckpoint: Boolean,
+      storageLevel: Option[StorageLevel]): Dataset[T]

/**
* Defines an event time watermark for this [[Dataset]]. A watermark tracks a point in time
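To make the @note above concrete, a short sketch of eager versus lazy local checkpointing with an explicit storage level; `spark` is an assumed active session and the column is illustrative.

    import org.apache.spark.sql.functions.rand
    import org.apache.spark.storage.StorageLevel

    // Eager: executed and materialized now, so the snapshot of the
    // non-deterministic column is fixed at this point.
    val snapshot = spark.range(100).withColumn("r", rand())
      .localCheckpoint(eager = true, storageLevel = StorageLevel.MEMORY_ONLY)

    // Lazy: nothing runs yet; the checkpoint is only finalized by the first
    // action, so "r" is deterministic only after that first execution.
    val deferred = spark.range(100).withColumn("r", rand())
      .localCheckpoint(eager = false, storageLevel = StorageLevel.MEMORY_ONLY)
    deferred.count() // first action finalizes the checkpoint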
@@ -507,6 +507,9 @@ message CheckpointCommand {

   // (Required) Whether to checkpoint this dataframe immediately.
   bool eager = 3;
+
+  // (Optional) For local checkpoint, the storage level to use.
+  optional StorageLevel storage_level = 4;
 }
 
 message MergeIntoTableCommand {
@@ -3354,9 +3354,18 @@ class SparkConnectPlanner(
       responseObserver: StreamObserver[proto.ExecutePlanResponse]): Unit = {
     val target = Dataset
       .ofRows(session, transformRelation(checkpointCommand.getRelation))
-    val checkpointed = target.checkpoint(
-      eager = checkpointCommand.getEager,
-      reliableCheckpoint = !checkpointCommand.getLocal)
+    val checkpointed = if (checkpointCommand.getLocal) {
+      if (checkpointCommand.hasStorageLevel) {
+        target.localCheckpoint(
+          eager = checkpointCommand.getEager,
+          storageLevel =
+            StorageLevelProtoConverter.toStorageLevel(checkpointCommand.getStorageLevel))
+      } else {
+        target.localCheckpoint(eager = checkpointCommand.getEager)
+      }
+    } else {
+      target.checkpoint(eager = checkpointCommand.getEager)
+    }
 
     val dfId = UUID.randomUUID().toString
     logInfo(log"Caching DataFrame with id ${MDC(DATAFRAME_ID, dfId)}")
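A sketch of the wire-level round trip this dispatch completes, using the generated proto builders. The values are illustrative, and the converter's import path is an assumption; only the calls that appear in the diff are taken as given.

    import org.apache.spark.connect.proto
    import org.apache.spark.sql.connect.common.StorageLevelProtoConverter
    import org.apache.spark.storage.StorageLevel

    // Client side: the storage level travels as an optional proto field.
    val command = proto.CheckpointCommand.newBuilder()
      .setLocal(true) // local checkpoint; reliable checkpoints ignore the field
      .setEager(true)
      .setStorageLevel(StorageLevelProtoConverter.toConnectProtoType(StorageLevel.DISK_ONLY))
      .build()

    // Server side: hasStorageLevel distinguishes "not set" from any concrete
    // level, which is why commands.proto declares the field as `optional`.
    assert(command.hasStorageLevel)
    assert(StorageLevelProtoConverter.toStorageLevel(command.getStorageLevel) ==
      StorageLevel.DISK_ONLY)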
11 changes: 10 additions & 1 deletion sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -540,13 +540,18 @@ class Dataset[T] private[sql](
   def isStreaming: Boolean = logicalPlan.isStreaming
 
   /** @inheritdoc */
-  protected[sql] def checkpoint(eager: Boolean, reliableCheckpoint: Boolean): Dataset[T] = {
+  protected[sql] def checkpoint(
+      eager: Boolean,
+      reliableCheckpoint: Boolean,
+      storageLevel: Option[StorageLevel]): Dataset[T] = {
     val actionName = if (reliableCheckpoint) "checkpoint" else "localCheckpoint"
     withAction(actionName, queryExecution) { physicalPlan =>
       val internalRdd = physicalPlan.execute().map(_.copy())
       if (reliableCheckpoint) {
+        assert(storageLevel.isEmpty, "StorageLevel should not be defined for reliableCheckpoint")
         internalRdd.checkpoint()
       } else {
+        storageLevel.foreach(storageLevel => internalRdd.persist(storageLevel))
         internalRdd.localCheckpoint()
       }
 
@@ -1810,6 +1815,10 @@ class Dataset[T] private[sql](
   /** @inheritdoc */
   override def localCheckpoint(eager: Boolean): Dataset[T] = super.localCheckpoint(eager)
 
+  /** @inheritdoc */
+  override def localCheckpoint(eager: Boolean, storageLevel: StorageLevel): Dataset[T] =
+    super.localCheckpoint(eager, storageLevel)
+
   /** @inheritdoc */
   override def joinWith[U](other: Dataset[U], condition: Column): Dataset[(T, U)] =
     super.joinWith(other, condition)
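The else-branch above is effectively a one-call version of persisting the underlying RDD before locally checkpointing it. A rough sketch of that equivalence, assuming an existing `spark` session:

    import org.apache.spark.storage.StorageLevel

    val df = spark.range(100)

    // New API: one call pins the storage level and truncates lineage.
    val viaApi = df.localCheckpoint(eager = true, storageLevel = StorageLevel.DISK_ONLY)

    // Roughly equivalent manual route on the RDD: persist at the desired level
    // first, then mark the RDD for local checkpointing and run an action.
    val rdd = df.rdd
    rdd.persist(StorageLevel.DISK_ONLY)
    rdd.localCheckpoint()
    rdd.count() // materializes the checkpoint, mirroring eager = true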