apache · jingz-db · Feb 29, 2024 · Mar 5, 2024 · Mar 5, 2024 · Mar 5, 2024
diff --git a/sql/api/src/main/scala/org/apache/spark/sql/streaming/MapState.scala b/sql/api/src/main/scala/org/apache/spark/sql/streaming/MapState.scala
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.streaming
+
+import org.apache.spark.annotation.{Evolving, Experimental}
+
+/**
+ * Interface used for arbitrary stateful operations with the v2 API to capture
+ * map value state.
+ */
+@Experimental
+@Evolving
+trait MapState[K, V] extends Serializable {
+  /** Whether state exists or not. */
+  def exists(): Boolean
+
+  /** Get the state value for the given user key, or null if the key is not present */
+  def getValue(key: K): V
+
+  /** Check if the user key is contained in the map */
+  def containsKey(key: K): Boolean
+
+  /** Update value for given user key */
+  def updateValue(key: K, value: V): Unit
+
+  /** Get the map associated with grouping key */
+  def getMap(): Map[K, V]
+
+  /** Get an iterator over the keys present in map associated with grouping key */
+  def getKeys(): Iterator[K]
+
+  /** Get an iterator over the values present in map associated with grouping key */
+  def getValues(): Iterator[V]
+
+  /** Remove user key from map state */
+  def removeKey(key: K): Unit
+
+  /** Remove this state. */
+  def clear(): Unit
+}
diff --git a/sql/api/src/main/scala/org/apache/spark/sql/streaming/StatefulProcessorHandle.scala b/sql/api/src/main/scala/org/apache/spark/sql/streaming/StatefulProcessorHandle.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.streaming
 import java.io.Serializable
 
 import org.apache.spark.annotation.{Evolving, Experimental}
+import org.apache.spark.sql.Encoder
 
 /**
  * Represents the operation handle provided to the stateful processor used in the
@@ -48,6 +49,18 @@ private[sql] trait StatefulProcessorHandle extends Serializable {
    */
   def getListState[T](stateName: String): ListState[T]
 
+  /**
+   * Creates new or returns existing map state associated with stateName.
+   * The MapState persists Key-Value pairs of type [K, V].
+   *
+   * @param stateName - name of the state variable
+   * @param userKeyEnc - Spark SQL encoder for the map key
+   * @tparam K - type of key for map state variable
+   * @tparam V - type of value for map state variable
+   * @return - instance of MapState of type [K, V] that can be used to store state persistently
+   */
+  def getMapState[K, V](stateName: String, userKeyEnc: Encoder[K]): MapState[K, V]
+
   /** Function to return queryInfo for currently running task */
   def getQueryInfo(): QueryInfo
 

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MapStateImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MapStateImpl.scala
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.execution.streaming
+
+import org.apache.spark.internal.Logging
+import org.apache.spark.sql.Encoder
+import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
+import org.apache.spark.sql.execution.streaming.state.{StateStore, StateStoreErrors, UnsafeRowPair}
+import org.apache.spark.sql.streaming.MapState
+import org.apache.spark.sql.types.{BinaryType, StructType}
+
+/**
+ * [[MapState]] implementation that keeps every map entry as its own state store row in the
+ * column family `stateName`, keyed by the composite (grouping key, user key).
+ *
+ * @param store - state store backing this state variable
+ * @param stateName - name of the state variable, also used as the column family name
+ * @param keyExprEnc - encoder for the grouping key
+ * @param userKeyExprEnc - Spark SQL encoder for the user key of the map
+ * @tparam K - type of the user key
+ * @tparam V - type of the map value
+ */
+class MapStateImpl[K, V](
+    store: StateStore,
+    stateName: String,
+    keyExprEnc: ExpressionEncoder[Any],
+    userKeyExprEnc: Encoder[K]) extends MapState[K, V] with Logging {
+
+  // Pack grouping key and user key together as a prefixed composite key
+  private val schemaForCompositeKeyRow: StructType =
+    new StructType().add("key", BinaryType).add("userKey", BinaryType)
+  private val schemaForValueRow: StructType = new StructType().add("value", BinaryType)
+  private val keySerializer = keyExprEnc.createSerializer()
+  private val stateTypesEncoder = CompositeKeyStateEncoder(
+    keySerializer, schemaForCompositeKeyRow, stateName, userKeyExprEnc)
+
+  // The first column (grouping key) is the prefix, enabling per-grouping-key prefix scans.
+  store.createColFamilyIfAbsent(stateName, schemaForCompositeKeyRow, numColsPrefixKey = 1,
+    schemaForValueRow)
+
+  /** Whether state exists or not. */
+  override def exists(): Boolean =
+    store.prefixScan(stateTypesEncoder.encodeGroupingKey(), stateName).nonEmpty
+
+  /** Get the state value for the given user key, or null if the key is not present */
+  override def getValue(key: K): V = {
+    StateStoreErrors.requireNonNullStateValue(key, stateName)
+    val valueRow = store.get(stateTypesEncoder.encodeCompositeKey(key), stateName)
+    if (valueRow == null) null.asInstanceOf[V] else stateTypesEncoder.decodeValue(valueRow)
+  }
+
+  /** Check if the user key is contained in the map */
+  override def containsKey(key: K): Boolean = {
+    StateStoreErrors.requireNonNullStateValue(key, stateName)
+    // A row lookup is enough to decide membership; the value does not need to be decoded.
+    store.get(stateTypesEncoder.encodeCompositeKey(key), stateName) != null
+  }
+
+  /** Update value for given user key */
+  override def updateValue(key: K, value: V): Unit = {
+    StateStoreErrors.requireNonNullStateValue(key, stateName)
+    StateStoreErrors.requireNonNullStateValue(value, stateName)
+    val encodedValue = stateTypesEncoder.encodeValue(value)
+    val encodedCompositeKey = stateTypesEncoder.encodeCompositeKey(key)
+    store.put(encodedCompositeKey, encodedValue, stateName)
+  }
+
+  /** Get the map associated with grouping key, fully materialized from a prefix scan. */
+  override def getMap(): Map[K, V] = {
+    val encodedGroupingKey = stateTypesEncoder.encodeGroupingKey()
+    store.prefixScan(encodedGroupingKey, stateName)
+      .map {
+        case pair: UnsafeRowPair =>
+          (stateTypesEncoder.decodeCompositeKey(pair.key),
+            stateTypesEncoder.decodeValue(pair.value))
+      }.toMap
+  }
+
+  /** Get an iterator over the keys present in map associated with grouping key */
+  override def getKeys(): Iterator[K] = getMap().keys.iterator
+
+  /** Get an iterator over the values present in map associated with grouping key */
+  override def getValues(): Iterator[V] = getMap().values.iterator
+
+  /** Remove user key from map state */
+  override def removeKey(key: K): Unit = {
+    StateStoreErrors.requireNonNullStateValue(key, stateName)
+    store.remove(stateTypesEncoder.encodeCompositeKey(key), stateName)
+  }
+
+  /** Remove this state. */
+  override def clear(): Unit = {
+    getKeys().foreach(removeKey)
+  }
+}
diff --git a/...core/src/main/scala/org/apache/spark/sql/execution/streaming/StateTypesEncoderUtils.scala b/...core/src/main/scala/org/apache/spark/sql/execution/streaming/StateTypesEncoderUtils.scala
@@ -19,8 +19,10 @@ package org.apache.spark.sql.execution.streaming
 
 import org.apache.commons.lang3.SerializationUtils
 
+import org.apache.spark.sql.Encoder
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder.Serializer
+import org.apache.spark.sql.catalyst.encoders.encoderFor
 import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, UnsafeRow}
 import org.apache.spark.sql.execution.streaming.state.StateStoreErrors
 import org.apache.spark.sql.types.{BinaryType, StructType}
@@ -86,3 +88,60 @@ object StateTypesEncoder {
     new StateTypesEncoder[GK](keySerializer, stateName)
   }
 }
+
+/**
+ * Encoder for state types whose rows are keyed by a composite of grouping key and user key.
+ *
+ * @param keySerializer - serializer for the grouping key
+ * @param schemaForCompositeKeyRow - schema of the composite key row
+ * @param stateName - name of the state variable
+ * @param userKeyEnc - Spark SQL encoder for the user key
+ */
+class CompositeKeyStateEncoder[GK, K](
+    keySerializer: Serializer[GK],
+    schemaForCompositeKeyRow: StructType,
+    stateName: String,
+    userKeyEnc: Encoder[K])
+  extends StateTypesEncoder[GK](keySerializer, stateName) {
+
+  private val compositeKeyProjection = UnsafeProjection.create(schemaForCompositeKeyRow)
+  // Scratch row re-pointed at the user key bytes on every decodeCompositeKey call.
+  private val reuseRow = new UnsafeRow(userKeyEnc.schema.fields.length)
+
+  // Resolve the expression encoder once and derive both directions from it.
+  private val userKeyExpressionEnc = encoderFor(userKeyEnc)
+  private val userKeyRowToObjDeserializer =
+    userKeyExpressionEnc.resolveAndBind().createDeserializer()
+  private val userKeySerializer = userKeyExpressionEnc.createSerializer()
+
+  /**
+   * Grouping key and user key are encoded as a row of `schemaForCompositeKeyRow` schema.
+   * Grouping key will be encoded in `RocksDBStateEncoder` as the prefix column.
+   */
+  def encodeCompositeKey(userKey: K): UnsafeRow = {
+    val groupingKey = ImplicitGroupingKeyTracker.getImplicitKeyOption
+      .getOrElse(throw StateStoreErrors.implicitKeyNotFound(stateName)).asInstanceOf[GK]
+    // Both keys are serialized to UnsafeRow bytes and become the columns of the composite row.
+    val groupingKeyBytes = keySerializer.apply(groupingKey).asInstanceOf[UnsafeRow].getBytes()
+    val userKeyBytes = userKeySerializer.apply(userKey).asInstanceOf[UnsafeRow].getBytes()
+    compositeKeyProjection(InternalRow(groupingKeyBytes, userKeyBytes))
+  }
+
+  /** Decodes the user key (column 1) of a composite key row; the grouping key is ignored. */
+  def decodeCompositeKey(row: UnsafeRow): K = {
+    val bytes = row.getBinary(1)
+    reuseRow.pointTo(bytes, bytes.length)
+    userKeyRowToObjDeserializer.apply(reuseRow)
+  }
+}
+
+object CompositeKeyStateEncoder {
+  /** Builds a [[CompositeKeyStateEncoder]], forwarding every argument to its constructor. */
+  def apply[GK, K](
+      keySerializer: Serializer[GK],
+      schemaForCompositeKeyRow: StructType,
+      stateName: String,
+      userKeyEnc: Encoder[K]): CompositeKeyStateEncoder[GK, K] =
+    new CompositeKeyStateEncoder[GK, K](
+      keySerializer, schemaForCompositeKeyRow, stateName, userKeyEnc)
+}
diff --git a/...src/main/scala/org/apache/spark/sql/execution/streaming/StatefulProcessorHandleImpl.scala b/...src/main/scala/org/apache/spark/sql/execution/streaming/StatefulProcessorHandleImpl.scala
@@ -20,9 +20,10 @@ import java.util.UUID
 
 import org.apache.spark.TaskContext
 import org.apache.spark.internal.Logging
+import org.apache.spark.sql.Encoder
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.execution.streaming.state.StateStore
-import org.apache.spark.sql.streaming.{ListState, QueryInfo, StatefulProcessorHandle, ValueState}
+import org.apache.spark.sql.streaming.{ListState, MapState, QueryInfo, StatefulProcessorHandle, ValueState}
 import org.apache.spark.util.Utils
 
 /**
@@ -138,4 +139,11 @@ class StatefulProcessorHandleImpl(
     val resultState = new ListStateImpl[T](store, stateName, keyEncoder)
     resultState
   }
+
+  override def getMapState[K, V](stateName: String, userKeyEnc: Encoder[K]): MapState[K, V] = {
+    // State variables may only be created while the handle is still in the CREATED state.
+    verify(currState == CREATED, s"Cannot create state variable with name=$stateName after " +
+      "initialization is complete")
+    new MapStateImpl[K, V](store, stateName, keyEncoder, userKeyEnc)
+  }
 }