implementing transform function in Python
Ken Takagiwa authored and giwa committed Aug 18, 2014
1 parent 9af03f4 commit dcf243f
Showing 3 changed files with 41 additions and 2 deletions.
3 changes: 1 addition & 2 deletions python/pyspark/streaming/dstream.py
@@ -172,7 +172,6 @@ def _mergeCombiners(iterator):
        return combiners.iteritems()
    return shuffled.mapPartitions(_mergeCombiners)


def partitionBy(self, numPartitions, partitionFunc=None):
"""
Return a copy of the DStream partitioned using the specified partitioner.
@@ -231,6 +230,7 @@ def slice(self, fromTime, toTime):
def transform(self, transformFunc):
    """
    Return a new DStream in which each RDD is generated by applying a
    function on each RDD of this DStream (work in progress).
    """
    # forwards to the Java DStream for now; the Python wrapper still raises
    self._jdstream.transform(transformFunc)
    raise NotImplementedError

def transformWith(self, other, transformFunc):
@@ -264,7 +264,6 @@ def _defaultReducePartitions(self):
"""
# hard code to avoid the error
return 2
if self.ctx._conf.contains("spark.default.parallelism"):
return self.ctx.defaultParallelism
else:
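For context, once wired up, transform is meant to let a user apply an arbitrary RDD-to-RDD function to every batch of a DStream. A minimal usage sketch, assuming a working StreamingContext ssc and a socket text stream (both hypothetical, not part of this commit):

    lines = ssc.socketTextStream("localhost", 9999)

    def sort_each_batch(rdd):
        # any RDD-to-RDD function can run once per batch interval
        return rdd.sortBy(lambda line: line)

    sorted_lines = lines.transform(sort_each_batch)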
37 changes: 37 additions & 0 deletions streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonTransformedDStream.scala
@@ -0,0 +1,37 @@
package org.apache.spark.streaming.api.python

import java.util.{List => JList, Map => JMap}

import scala.reflect.ClassTag

import org.apache.spark.Accumulator
import org.apache.spark.api.python.PythonRDD
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.api.java.JavaDStream
import org.apache.spark.streaming.{Duration, Time}
import org.apache.spark.streaming.dstream.DStream

/**
 * Created by ken on 7/15/14.
 */
class PythonTransformedDStream[T: ClassTag](
    parents: Seq[DStream[T]],
    command: Array[Byte],
    envVars: JMap[String, String],
    pythonIncludes: JList[String],
    preservePartitioning: Boolean,
    pythonExec: String,
    broadcastVars: JList[Broadcast[Array[Byte]]],
    accumulator: Accumulator[JList[Array[Byte]]]
  ) extends DStream[Array[Byte]](parents.head.ssc) {

  override def dependencies = parents.toList

  override def slideDuration: Duration = parents.head.slideDuration

  // Sketch of the intended behavior (assumption, not finalized in this commit):
  // run the serialized Python function (command) over the first parent's RDD
  // through PythonRDD; the multi-parent case is not handled yet.
  override def compute(validTime: Time): Option[RDD[Array[Byte]]] = {
    val parentRDDs = parents.map(_.getOrCompute(validTime).orNull).toSeq
    Option(parentRDDs.head).map { rdd =>
      new PythonRDD(rdd, command, envVars, pythonIncludes, preservePartitioning,
        pythonExec, broadcastVars, accumulator)
    }
  }

  val asJavaDStream = JavaDStream.fromDStream(this)
}
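The command bytes this class consumes would be produced by pickling the user's Python function. A rough sketch, assuming PySpark's CloudPickleSerializer is used here as it is for batch PythonRDD jobs (the function double is illustrative):

    from pyspark.serializers import CloudPickleSerializer

    def double(rdd):
        return rdd.map(lambda x: x * 2)

    # bytearray converts to the Array[Byte] `command` on the JVM side via Py4J
    command = bytearray(CloudPickleSerializer().dumps(double))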
3 changes: 3 additions & 0 deletions streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala
@@ -561,9 +561,12 @@ abstract class DStream[T: ClassTag] (
    // because the DStream is reachable from the outer object here, and because
    // DStreams can't be serialized with closures, we can't proactively check
    // it for serializability and so we pass the optional false to SparkContext.clean

    // the transform function may be a serialized Python function
    val cleanedF = context.sparkContext.clean(transformFunc, false)
    val realTransformFunc = (rdds: Seq[RDD[_]], time: Time) => {
      assert(rdds.length == 1)
      // works as long as transformFunc itself is serializable
      cleanedF(rdds.head.asInstanceOf[RDD[T]], time)
    }
    new TransformedDStream[U](Seq(this), realTransformFunc)
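For a sense of what this wrapper enables: transform exposes whole-RDD operations per batch, not just element-wise ones. A hypothetical example (stream and RDD names invented) that drops records from each batch whose key appears in a static blacklist RDD:

    blacklist = sc.parallelize([("spam-host", True)])

    def drop_blacklisted(rdd):
        # keep only (host, record) pairs whose key is absent from the blacklist
        return (rdd.leftOuterJoin(blacklist)
                   .filter(lambda kv: kv[1][1] is None)
                   .map(lambda kv: (kv[0], kv[1][0])))

    clean_stream = host_stream.transform(drop_blacklisted)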
