[SPARK-13149][SQL]Add FileStreamSource #11034

Closed
wants to merge 16 commits
@@ -100,6 +100,7 @@ class DataStreamReader private[sql](sqlContext: SQLContext) extends Logging {
val resolved = ResolvedDataSource.createSource(
sqlContext,
userSpecifiedSchema = userSpecifiedSchema,
partitionColumns = Array.empty[String],
Contributor:
Wait a second... I think we only ever give the partition columns to a DataSource when it is coming from the Hive metastore. Given that, I don't know whether we actually need to do this at all for streaming. Could we simplify this?

providerName = source,
options = extraOptions.toMap)
DataFrame(sqlContext, StreamingRelation(resolved))
@@ -95,6 +95,7 @@ object ResolvedDataSource extends Logging {
def createSource(
sqlContext: SQLContext,
userSpecifiedSchema: Option[StructType],
partitionColumns: Array[String],
providerName: String,
options: Map[String, String]): Source = {
val provider = lookupDataSource(providerName).newInstance() match {
@@ -104,7 +105,20 @@
s"Data source $providerName does not support streamed reading")
}

provider.createSource(sqlContext, options, userSpecifiedSchema)
userSpecifiedSchema match {
case Some(schema) => {
val maybePartitionsSchema = if (partitionColumns.isEmpty) {
None
} else {
Some(partitionColumnsSchema(
schema, partitionColumns, sqlContext.conf.caseSensitiveAnalysis))
}
val dataSchema =
StructType(schema.filterNot(f => partitionColumns.contains(f.name))).asNullable
provider.createSource(sqlContext, Some(dataSchema), maybePartitionsSchema, options)
}
case None => provider.createSource(sqlContext, None, None, options)
}
}
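
For reference, the new branch above handles the case where a full user schema and partition columns arrive together: the partition columns are carved out into their own StructType, and the remaining fields (forced nullable) become the data schema handed to the provider. A rough illustration with made-up column names, approximating the partitionColumnsSchema helper with a plain filter:

import org.apache.spark.sql.types._

// Illustrative only; "value"/"date" and the filter-based split are assumptions.
val userSchema = new StructType()
  .add("value", StringType)
  .add("date", StringType)        // pretend "date" is a partition column
val partitionColumns = Array("date")

val partitionsSchema =
  StructType(userSchema.filter(f => partitionColumns.contains(f.name)))
val dataSchema =
  StructType(userSchema.filterNot(f => partitionColumns.contains(f.name))).asNullable
// dataSchema       -> StructType(StructField("value", StringType, nullable = true))
// partitionsSchema -> StructType(StructField("date", StringType, nullable = true))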

def createSink(
@@ -0,0 +1,63 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.execution.streaming

import java.nio.ByteBuffer

import org.apache.hadoop.fs.{FileSystem, Path}

import org.apache.spark.Logging
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.sql.SQLContext

/**
* A very simple sink that stores received data on the filesystem as a text file.
* This is not atomic.
*/
class FileStreamSink(
Contributor:
If possible it would be nice to do this in a different PR.

sqlContext: SQLContext,
metadataPath: String,
path: String) extends Sink with Logging {

private def sparkContext = sqlContext.sparkContext
private val fs = FileSystem.get(sparkContext.hadoopConfiguration)
private val serializer = new JavaSerializer(sqlContext.sparkContext.conf).newInstance()

override def currentOffset: Option[Offset] = {
try {
val buffer = new Array[Byte](10240)
val stream = fs.open(new Path(metadataPath))
val size = stream.read(buffer)
val shrunk = ByteBuffer.wrap(buffer.take(size))
Some(serializer.deserialize[Offset](shrunk))
} catch {
case _: java.io.FileNotFoundException =>
None
}
}

// TODO: this is not atomic.
override def addBatch(batch: Batch): Unit = {
batch.data.write.mode("append").text(path)
val offset = serializer.serialize(batch.end)
val stream = fs.create(new Path(metadataPath), true)
stream.write(offset.array())
stream.close()
logInfo(s"Committed batch ${batch.end}")
}
}
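
To make the commit order explicit: addBatch first appends the batch's rows as text under path, then overwrites the single file at metadataPath with the serialized end offset, and currentOffset reads that file back so a restarted query can tell which batch was last committed. A hedged sketch of how the sink is driven (sink and batch stand in for whatever StreamExecution supplies):

// Placeholder driver steps, not code from the PR.
val lastCommitted: Option[Offset] = sink.currentOffset  // None until the first batch lands
sink.addBatch(batch)  // 1) append batch.data as text under `path`
                      // 2) overwrite `metadataPath` with the serialized batch.end
// A crash between 1) and 2) is the non-atomicity the TODO refers to: the data is
// written, but the offset file still points at the previous batch.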
@@ -0,0 +1,141 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.execution.streaming

import java.io.{BufferedWriter, OutputStreamWriter}

import scala.collection.mutable.ArrayBuffer

import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}

import org.apache.spark.Logging
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.util.collection.OpenHashSet

/**
* A very simple source that reads text files from the given directory as they appear.
*/
class FileStreamSource(
sqlContext: SQLContext,
metadataPath: String,
path: String,
dataSchema: Option[StructType],
dataFrameBuilder: Array[String] => DataFrame) extends Source with Logging {

import sqlContext.implicits._

/** Returns the schema of the data from this source */
override def schema: StructType = dataSchema.getOrElse(new StructType().add("value", StringType))
Contributor:
This getOrElse is only going to work for the text file data source. For things like JSON, I think we should probably try to initialize the source using dataFrameBuilder and extract the schema from there.

We should also add tests that would catch a problem here.

Member (Author):
Updated the logic here: if there are any existing files, it will now use them to infer the schema. I also added a test for this.
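
The updated inference is not part of this commit's diff, but the idea described above would look roughly like the following, where existing files are run through dataFrameBuilder and its schema is used whenever no dataSchema was supplied (names and structure here are guesses, not the actual change):

// Hypothetical sketch of the schema inference described above.
override def schema: StructType = dataSchema.getOrElse {
  val existing = fetchAllFiles()
  if (existing.nonEmpty) {
    dataFrameBuilder(existing.toArray).schema   // let the underlying format infer it
  } else {
    new StructType().add("value", StringType)   // text-source fallback, as before
  }
}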


/** Returns the maximum offset that can be retrieved from the source. */
def fetchMaxOffset(): LongOffset = synchronized {
val filesPresent = fetchAllFiles()
val newFiles = new ArrayBuffer[String]()
filesPresent.foreach { file =>
if (!seenFiles.contains(file)) {
logDebug(s"new file: $file")
newFiles.append(file)
seenFiles.add(file)
} else {
logDebug(s"old file: $file")
}
}

if (newFiles.nonEmpty) {
maxBatchFile += 1
writeBatch(maxBatchFile, newFiles)
}

new LongOffset(maxBatchFile)
}

/**
* Returns the next batch of data that is available after `start`, if any is available.
*/
override def getNextBatch(start: Option[Offset]): Option[Batch] = {
val startId = start.map(_.asInstanceOf[LongOffset].offset).getOrElse(0L)
val end = fetchMaxOffset()
val endId = end.offset

val batchFiles = (startId to endId).filter(_ >= 0).map(i => s"$metadataPath/$i")
if (!(batchFiles.isEmpty || start == Some(end))) {
logDebug(s"Producing files from batches $start:$endId")
logDebug(s"Batch files: $batchFiles")

// Probably does not need to be a spark job...
val files = sqlContext
.read
.text(batchFiles: _*)
.as[String]
.collect()
logDebug(s"Streaming ${files.mkString(", ")}")
Some(new Batch(end, dataFrameBuilder(files)))
} else {
None
}
}

def restart(): FileStreamSource = {
new FileStreamSource(sqlContext, metadataPath, path, dataSchema, dataFrameBuilder)
}

private def sparkContext = sqlContext.sparkContext

private val fs = FileSystem.get(sparkContext.hadoopConfiguration)
private val existingBatchFiles = fetchAllBatchFiles()
private val existingBatchIds = existingBatchFiles.map(_.getPath.getName.toInt)
private var maxBatchFile = if (existingBatchIds.isEmpty) -1 else existingBatchIds.max
Contributor:
nit: Isn't this maxBatchId rather than the file (since it's an int, rather than a string)?

private val seenFiles = new OpenHashSet[String]

if (existingBatchFiles.nonEmpty) {
Contributor:
can you add some comments on what this Spark job is for?

sqlContext.read
.text(existingBatchFiles.map(_.getPath.toString): _*)
Contributor:
shouldn't these be indented two spaces rather than four?

.as[String]
.collect()
.foreach { file =>
seenFiles.add(file)
}
}

private def fetchAllBatchFiles(): Seq[FileStatus] = {
try fs.listStatus(new Path(metadataPath)) catch {
case _: java.io.FileNotFoundException =>
fs.mkdirs(new Path(metadataPath))
Seq.empty
}
}

private def fetchAllFiles(): Seq[String] = {
fs.listStatus(new Path(path))
.filterNot(_.getPath.getName.startsWith("_"))
.map(_.getPath.toUri.toString)
}

private def writeBatch(id: Int, files: Seq[String]): Unit = {
val path = new Path(metadataPath + "/" + id)
val fs = FileSystem.get(sparkContext.hadoopConfiguration)
val writer = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)))
files.foreach { file =>
writer.write(file)
writer.write("\n")
}
writer.close()
logDebug(s"Wrote batch file $path")
}
}
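
For orientation, a minimal sketch of how this source's cycle is exercised; the paths, the builder, and the direct instantiation are illustrative, since in practice the source is created through ResolvedDataSource.createSource:

// Illustrative wiring only; the paths and the text-based builder are made up.
val source = new FileStreamSource(
  sqlContext,
  metadataPath = "/tmp/stream/_metadata",
  path = "/tmp/stream/input",
  dataSchema = None,
  dataFrameBuilder = files => sqlContext.read.text(files: _*))

source.fetchMaxOffset()    // scans `path` and records any new files as a numbered batch file
source.getNextBatch(None)  // replays the batch files and builds a DataFrame over the listed files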
@@ -173,6 +173,23 @@ class StreamExecution(
logDebug(s"Waiting for data, current: $streamProgress")
}

/** Clears the indicator that a batch has completed. Used for testing. */
def clearBatchMarker(): Unit = {
batchRun = false
}

/**
* Awaits the completion of at least one streaming batch. Must be called after `clearBatchMarker`
* to guarantee that a new batch has been processed.
*/
def awaitBatchCompletion(): Unit = {
while (!batchRun) {
awaitBatchLock.synchronized {
awaitBatchLock.wait(100)
}
}
}
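
As the doc comments say, the two helpers are meant to bracket a test step: clear the marker, produce new input, then block until at least one more batch has run. A hedged sketch of that pattern (the execution handle and the file-producing step are placeholders):

// Placeholder test pattern.
execution.clearBatchMarker()
// ... write a new file into the source directory here ...
execution.awaitBatchCompletion()   // returns once a fresh batch has completed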

/**
* Signals to the thread executing micro-batches that it should stop running after the next
* batch. This method blocks until the thread stops running.
@@ -35,7 +35,7 @@ import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection
import org.apache.spark.sql.execution.{FileRelation, RDDConversions}
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.execution.streaming.{Sink, Source}
import org.apache.spark.sql.execution.streaming.{FileStreamSink, FileStreamSource, Sink, Source}
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.util.SerializableConfiguration

@@ -130,8 +130,9 @@ trait SchemaRelationProvider {
trait StreamSourceProvider {
def createSource(
sqlContext: SQLContext,
parameters: Map[String, String],
schema: Option[StructType]): Source
partitionColumns: Option[StructType],
dataSchema: Option[StructType],
Contributor:
nit: could we order these the same as below: dataSchema, partitionColumns, parameters

parameters: Map[String, String]): Source
}

/**
@@ -168,7 +169,7 @@ trait StreamSinkProvider {
* @since 1.4.0
*/
@Experimental
trait HadoopFsRelationProvider {
trait HadoopFsRelationProvider extends StreamSourceProvider with StreamSinkProvider {
/**
* Returns a new base relation with the given parameters, a user defined schema, and a list of
* partition columns. Note: the parameters' keywords are case insensitive and this insensitivity
@@ -195,6 +196,38 @@
}
createRelation(sqlContext, paths, dataSchema, partitionColumns, parameters)
}

override def createSource(
sqlContext: SQLContext,
partitionColumns: Option[StructType],
dataSchema: Option[StructType],
parameters: Map[String, String]): Source = {
val path = parameters("path")
val metadataPath = parameters.getOrElse("metadataPath", s"$path/_metadata")

def dataFrameBuilder(files: Array[String]): DataFrame = {
val relation = createRelation(
sqlContext,
files,
dataSchema,
partitionColumns,
bucketSpec = None,
parameters)
DataFrame(sqlContext, LogicalRelation(relation))
}

new FileStreamSource(sqlContext, metadataPath, path, dataSchema, dataFrameBuilder)
}

override def createSink(
sqlContext: SQLContext,
parameters: Map[String, String],
partitionColumns: Seq[String]): Sink = {
val path = parameters("path")
val metadataPath = parameters.getOrElse("metadataPath", s"$path/_metadata")

new FileStreamSink(sqlContext, metadataPath, path)
}
}
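
The default createSource above reads only two keys from the options map, so any HadoopFsRelationProvider can be pointed at a directory like this (the provider instance and the paths are illustrative):

// Illustrative call; `provider` is some HadoopFsRelationProvider implementation.
val source = provider.createSource(
  sqlContext,
  partitionColumns = None,
  dataSchema = None,
  parameters = Map(
    "path" -> "/data/incoming",                     // required
    "metadataPath" -> "/data/incoming/_metadata"))  // optional; this is also the default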

/**
@@ -33,10 +33,11 @@ object LastOptions {
class DefaultSource extends StreamSourceProvider with StreamSinkProvider {
override def createSource(
sqlContext: SQLContext,
parameters: Map[String, String],
schema: Option[StructType]): Source = {
partitionColumns: Option[StructType],
dataSchema: Option[StructType],
parameters: Map[String, String]): Source = {
Contributor:
If we have to add stuff we should probably add a schema too.

LastOptions.parameters = parameters
LastOptions.schema = schema
LastOptions.schema = dataSchema
new Source {
override def getNextBatch(start: Option[Offset]): Option[Batch] = None
override def schema: StructType = StructType(StructField("a", IntegerType) :: Nil)