apache · jingz-db · Mar 8, 2024 · Mar 8, 2024 · Mar 9, 2024 · Mar 9, 2024
diff --git a/sql/api/src/main/scala/org/apache/spark/sql/streaming/StatefulProcessorHandle.scala b/sql/api/src/main/scala/org/apache/spark/sql/streaming/StatefulProcessorHandle.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.streaming
 import java.io.Serializable
 
 import org.apache.spark.annotation.{Evolving, Experimental}
+import org.apache.spark.sql.Encoder
 
 /**
  * Represents the operation handle provided to the stateful processor used in the
@@ -33,20 +34,22 @@ private[sql] trait StatefulProcessorHandle extends Serializable {
    * The user must ensure to call this function only within the `init()` method of the
    * StatefulProcessor.
    * @param stateName - name of the state variable
+   * @param valEncoder - Spark SQL encoder used to serialize and deserialize values of type T
    * @tparam T - type of state variable
    * @return - instance of ValueState of type T that can be used to store state persistently
    */
-  def getValueState[T](stateName: String): ValueState[T]
+  def getValueState[T](stateName: String, valEncoder: Encoder[T]): ValueState[T]
 
   /**
    * Creates new or returns existing list state associated with stateName.
    * The ListState persists values of type T.
    *
    * @param stateName  - name of the state variable
+   * @param valEncoder - Spark SQL encoder used to serialize and deserialize values of type T
    * @tparam T - type of state variable
    * @return - instance of ListState of type T that can be used to store state persistently
    */
-  def getListState[T](stateName: String): ListState[T]
+  def getListState[T](stateName: String, valEncoder: Encoder[T]): ListState[T]
 
   /** Function to return queryInfo for currently running task */
   def getQueryInfo(): QueryInfo

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ListStateImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ListStateImpl.scala
@@ -17,6 +17,7 @@
 package org.apache.spark.sql.execution.streaming
 
 import org.apache.spark.internal.Logging
+import org.apache.spark.sql.Encoder
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.execution.streaming.StateKeyValueRowSchema.{KEY_ROW_SCHEMA, VALUE_ROW_SCHEMA}
 import org.apache.spark.sql.execution.streaming.state.{StateStore, StateStoreErrors}
@@ -28,17 +29,20 @@ import org.apache.spark.sql.streaming.ListState
  *
  * @param store - reference to the StateStore instance to be used for storing state
  * @param stateName - name of logical state partition
+ * @param keyExprEnc - Spark SQL encoder for key
+ * @param valEncoder - Spark SQL encoder for value
  * @tparam S - data type of object that will be stored in the list
  */
 class ListStateImpl[S](
      store: StateStore,
      stateName: String,
-     keyExprEnc: ExpressionEncoder[Any])
+     keyExprEnc: ExpressionEncoder[Any],
+     valEncoder: Encoder[S])
   extends ListState[S] with Logging {
 
   private val keySerializer = keyExprEnc.createSerializer()
 
-  private val stateTypesEncoder = StateTypesEncoder(keySerializer, stateName)
+  private val stateTypesEncoder = StateTypesEncoder(keySerializer, valEncoder, stateName)
 
   store.createColFamilyIfAbsent(stateName, KEY_ROW_SCHEMA, numColsPrefixKey = 0,
     VALUE_ROW_SCHEMA, useMultipleValuesPerKey = true)

diff --git a/...core/src/main/scala/org/apache/spark/sql/execution/streaming/StateTypesEncoderUtils.scala b/...core/src/main/scala/org/apache/spark/sql/execution/streaming/StateTypesEncoderUtils.scala
@@ -17,10 +17,10 @@
 
 package org.apache.spark.sql.execution.streaming
 
-import org.apache.commons.lang3.SerializationUtils
-
+import org.apache.spark.sql.Encoder
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder.Serializer
+import org.apache.spark.sql.catalyst.encoders.encoderFor
 import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, UnsafeRow}
 import org.apache.spark.sql.execution.streaming.state.StateStoreErrors
 import org.apache.spark.sql.types.{BinaryType, StructType}
@@ -41,17 +41,27 @@ object StateKeyValueRowSchema {
  *
  * @param keySerializer - serializer to serialize the grouping key of type `GK`
  *     to an [[InternalRow]]
+ * @param valEncoder - SQL encoder for value of type `V`
  * @param stateName - name of logical state partition
  * @tparam GK - grouping key type
+ * @tparam V - value type
  */
-class StateTypesEncoder[GK](
+class StateTypesEncoder[GK, V](
     keySerializer: Serializer[GK],
+    valEncoder: Encoder[V],
     stateName: String) {
   import org.apache.spark.sql.execution.streaming.StateKeyValueRowSchema._
 
+  /** Variables reused for conversions between byte array and UnsafeRow */
   private val keyProjection = UnsafeProjection.create(KEY_ROW_SCHEMA)
   private val valueProjection = UnsafeProjection.create(VALUE_ROW_SCHEMA)
 
+  /** Variables reused for value conversions between Spark SQL rows and objects */
+  private val valExpressionEnc = encoderFor(valEncoder)
+  private val objToRowSerializer = valExpressionEnc.createSerializer()
+  private val rowToObjDeserializer = valExpressionEnc.resolveAndBind().createDeserializer()
+  private val reuseRow = new UnsafeRow(valEncoder.schema.fields.length)
+
   // TODO: validate places that are trying to encode the key and check if we can eliminate/
   // add caching for some of these calls.
   def encodeGroupingKey(): UnsafeRow = {
@@ -66,23 +76,26 @@ class StateTypesEncoder[GK](
     keyRow
   }
 
-  def encodeValue[S](value: S): UnsafeRow = {
-    val valueByteArr = SerializationUtils.serialize(value.asInstanceOf[Serializable])
-    val valueRow = valueProjection(InternalRow(valueByteArr))
-    valueRow
+  def encodeValue(value: V): UnsafeRow = {
+    // Serialize the object with the value encoder, then wrap the serialized row's bytes
+    // in a row produced by the VALUE_ROW_SCHEMA projection.
+    val serialized = objToRowSerializer(value).asInstanceOf[UnsafeRow]
+    valueProjection(InternalRow(serialized.getBytes()))
   }
 
-  def decodeValue[S](row: UnsafeRow): S = {
-    SerializationUtils
-      .deserialize(row.getBinary(0))
-      .asInstanceOf[S]
+  def decodeValue(row: UnsafeRow): V = {
+    // Re-point the shared scratch row at the stored bytes, then deserialize it to an object.
+    val payload = row.getBinary(0)
+    reuseRow.pointTo(payload, payload.length)
+    rowToObjDeserializer(reuseRow)
   }
 }
 
 object StateTypesEncoder {
-  def apply[GK](
+  def apply[GK, V](
       keySerializer: Serializer[GK],
-      stateName: String): StateTypesEncoder[GK] = {
-    new StateTypesEncoder[GK](keySerializer, stateName)
+      valEncoder: Encoder[V],
+      stateName: String): StateTypesEncoder[GK, V] = {
+    new StateTypesEncoder(keySerializer, valEncoder, stateName)
   }
 }
diff --git a/...src/main/scala/org/apache/spark/sql/execution/streaming/StatefulProcessorHandleImpl.scala b/...src/main/scala/org/apache/spark/sql/execution/streaming/StatefulProcessorHandleImpl.scala
@@ -20,6 +20,7 @@ import java.util.UUID
 
 import org.apache.spark.TaskContext
 import org.apache.spark.internal.Logging
+import org.apache.spark.sql.Encoder
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.execution.streaming.state.StateStore
 import org.apache.spark.sql.streaming.{ListState, QueryInfo, StatefulProcessorHandle, ValueState}
@@ -112,10 +113,10 @@ class StatefulProcessorHandleImpl(
 
   def getHandleState: StatefulProcessorHandleState = currState
 
-  override def getValueState[T](stateName: String): ValueState[T] = {
+  override def getValueState[T](stateName: String, valEncoder: Encoder[T]): ValueState[T] = {
     verify(currState == CREATED, s"Cannot create state variable with name=$stateName after " +
       "initialization is complete")
-    val resultState = new ValueStateImpl[T](store, stateName, keyEncoder)
+    val resultState = new ValueStateImpl[T](store, stateName, keyEncoder, valEncoder)
     resultState
   }
 
@@ -132,10 +133,10 @@ class StatefulProcessorHandleImpl(
     store.removeColFamilyIfExists(stateName)
   }
 
-  override def getListState[T](stateName: String): ListState[T] = {
+  override def getListState[T](stateName: String, valEncoder: Encoder[T]): ListState[T] = {
     verify(currState == CREATED, s"Cannot create state variable with name=$stateName after " +
       "initialization is complete")
-    val resultState = new ListStateImpl[T](store, stateName, keyEncoder)
+    val resultState = new ListStateImpl[T](store, stateName, keyEncoder, valEncoder)
     resultState
   }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ValueStateImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ValueStateImpl.scala
@@ -17,6 +17,7 @@
 package org.apache.spark.sql.execution.streaming
 
 import org.apache.spark.internal.Logging
+import org.apache.spark.sql.Encoder
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.catalyst.expressions.UnsafeRow
 import org.apache.spark.sql.execution.streaming.StateKeyValueRowSchema.{KEY_ROW_SCHEMA, VALUE_ROW_SCHEMA}
@@ -29,16 +30,18 @@ import org.apache.spark.sql.streaming.ValueState
  * @param store - reference to the StateStore instance to be used for storing state
  * @param stateName - name of logical state partition
  * @param keyEnc - Spark SQL encoder for key
+ * @param valEncoder - Spark SQL encoder for value
  * @tparam S - data type of object that will be stored
  */
 class ValueStateImpl[S](
     store: StateStore,
     stateName: String,
-    keyExprEnc: ExpressionEncoder[Any]) extends ValueState[S] with Logging {
+    keyExprEnc: ExpressionEncoder[Any],
+    valEncoder: Encoder[S]) extends ValueState[S] with Logging {
 
   private val keySerializer = keyExprEnc.createSerializer()
 
-  private val stateTypesEncoder = StateTypesEncoder(keySerializer, stateName)
+  private val stateTypesEncoder = StateTypesEncoder(keySerializer, valEncoder, stateName)
 
   store.createColFamilyIfAbsent(stateName, KEY_ROW_SCHEMA, numColsPrefixKey = 0,
     VALUE_ROW_SCHEMA)
@@ -57,7 +60,7 @@ class ValueStateImpl[S](
   override def get(): S = {
     val retRow = getImpl()
     if (retRow != null) {
-      stateTypesEncoder.decodeValue[S](retRow)
+      stateTypesEncoder.decodeValue(retRow)
     } else {
       null.asInstanceOf[S]
     }

diff --git a/sql/core/src/test/java/org/apache/spark/sql/execution/streaming/state/POJOTestClass.java b/sql/core/src/test/java/org/apache/spark/sql/execution/streaming/state/POJOTestClass.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.streaming.state;
+
+import java.util.Objects;
+
+/**
+ * A POJO class used for tests of arbitrary state SQL encoder.
+ *
+ * It follows the JavaBean shape: a public no-arg constructor plus a getter/setter pair
+ * for each of the two fields, {@code name} and {@code id}. Equality and hashing are
+ * value-based over both fields.
+ */
+public class POJOTestClass {
+  // Fields
+  private String name;
+  private int id;
+
+  // Constructors
+
+  /** Creates an instance with default field values ({@code name} null, {@code id} 0). */
+  public POJOTestClass() {
+    // Default constructor
+  }
+
+  /** Creates an instance with the given name and id. */
+  public POJOTestClass(String name, int id) {
+    this.name = name;
+    this.id = id;
+  }
+
+  // Getter methods
+  public String getName() {
+    return name;
+  }
+
+  public int getId() {
+    return id;
+  }
+
+  // Setter methods
+  public void setName(String name) {
+    this.name = name;
+  }
+
+  public void setId(int id) {
+    this.id = id;
+  }
+
+  // Additional methods if needed
+
+  /** Increments {@code id} by one and prints the new value to standard output. */
+  public void incrementId() {
+    id++;
+    System.out.println(name + " is now " + id + "!");
+  }
+
+  // Override toString for better representation
+  @Override
+  public String toString() {
+    return "POJOTestClass{" +
+      "name='" + name + '\'' +
+      ", id=" + id +
+      '}';
+  }
+
+  // Override equals and hashCode for custom equality
+
+  /**
+   * Value equality on {@code name} and {@code id}. Returns false for null or for an
+   * object of another type instead of throwing, and tolerates a null {@code name}.
+   */
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (!(obj instanceof POJOTestClass)) {
+      return false;
+    }
+    POJOTestClass testObj = (POJOTestClass) obj;
+    return id == testObj.id && Objects.equals(name, testObj.name);
+  }
+
+  /** Hash derived from the same two fields as equals, so equal objects hash alike. */
+  @Override
+  public int hashCode() {
+    return Objects.hash(name, id);
+  }
+}
+