-
Notifications
You must be signed in to change notification settings - Fork 28.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-13149][SQL]Add FileStreamSource #11034
Changes from 5 commits
a2784ff
6a90c55
9f5967f
937b86f
2af6fc8
f38153b
1ccea0f
ca1e6b8
d911963
ce0556d
9fd21c2
9a1042c
91a2b74
1ffee5f
fb0e3f9
07e2ddd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.spark.sql.execution.streaming | ||
|
||
import java.io.{BufferedWriter, OutputStreamWriter} | ||
|
||
import scala.collection.mutable.ArrayBuffer | ||
|
||
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} | ||
|
||
import org.apache.spark.Logging | ||
import org.apache.spark.sql.{DataFrame, SQLContext} | ||
import org.apache.spark.sql.types.{StringType, StructType} | ||
import org.apache.spark.util.collection.OpenHashSet | ||
|
||
/** | ||
* A very simple source that reads text files from the given directory as they appear. | ||
*/ | ||
class FileStreamSource( | ||
sqlContext: SQLContext, | ||
metadataPath: String, | ||
path: String, | ||
dataSchema: Option[StructType], | ||
dataFrameBuilder: Array[String] => DataFrame) extends Source with Logging { | ||
|
||
import sqlContext.implicits._ | ||
|
||
/** Returns the schema of the data from this source */ | ||
override lazy val schema: StructType = { | ||
dataSchema.getOrElse { | ||
val filesPresent = fetchAllFiles() | ||
if (filesPresent.isEmpty) { | ||
new StructType().add("value", StringType) | ||
} else { | ||
// There are some existing files. Use them to infer the schema | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: "." at the end. |
||
dataFrameBuilder(filesPresent.toArray).schema | ||
} | ||
} | ||
} | ||
|
||
/** Returns the maximum offset that can be retrieved from the source. */ | ||
def fetchMaxOffset(): LongOffset = synchronized { | ||
val filesPresent = fetchAllFiles() | ||
val newFiles = new ArrayBuffer[String]() | ||
filesPresent.foreach { file => | ||
if (!seenFiles.contains(file)) { | ||
logDebug(s"new file: $file") | ||
newFiles.append(file) | ||
seenFiles.add(file) | ||
} else { | ||
logDebug(s"old file: $file") | ||
} | ||
} | ||
|
||
if (newFiles.nonEmpty) { | ||
maxBatchFile += 1 | ||
writeBatch(maxBatchFile, newFiles) | ||
} | ||
|
||
new LongOffset(maxBatchFile) | ||
} | ||
|
||
def currentOffset: LongOffset = synchronized { | ||
new LongOffset(maxBatchFile) | ||
} | ||
|
||
/** | ||
* Returns the next batch of data that is available after `start`, if any is available. | ||
*/ | ||
override def getNextBatch(start: Option[Offset]): Option[Batch] = { | ||
val startId = start.map(_.asInstanceOf[LongOffset].offset).getOrElse(-1L) | ||
val end = fetchMaxOffset() | ||
val endId = end.offset | ||
|
||
val batchFiles = (startId + 1 to endId).filter(_ >= 0).map(i => s"$metadataPath/$i") | ||
if (batchFiles.nonEmpty) { | ||
logDebug(s"Producing files from batches ${startId + 1}:$endId") | ||
logDebug(s"Batch files: $batchFiles") | ||
|
||
// Probably does not need to be a spark job... | ||
val files = sqlContext | ||
.read | ||
.text(batchFiles: _*) | ||
.as[String] | ||
.collect() | ||
logDebug(s"Streaming ${files.mkString(", ")}") | ||
Some(new Batch(end, dataFrameBuilder(files))) | ||
} else { | ||
None | ||
} | ||
} | ||
|
||
private def sparkContext = sqlContext.sparkContext | ||
|
||
private val fs = FileSystem.get(sparkContext.hadoopConfiguration) | ||
private val existingBatchFiles = fetchAllBatchFiles() | ||
private val existingBatchIds = existingBatchFiles.map(_.getPath.getName.toInt) | ||
private var maxBatchFile = if (existingBatchIds.isEmpty) -1 else existingBatchIds.max | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: Isnt this |
||
private val seenFiles = new OpenHashSet[String] | ||
|
||
if (existingBatchFiles.nonEmpty) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add some comments on what this Spark job is for? |
||
sqlContext.read | ||
.text(existingBatchFiles.map(_.getPath.toString): _*) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. shouldnt these be two indented rather than four indented? |
||
.as[String] | ||
.collect() | ||
.foreach { file => | ||
seenFiles.add(file) | ||
} | ||
} | ||
|
||
private def fetchAllBatchFiles(): Seq[FileStatus] = { | ||
try fs.listStatus(new Path(metadataPath)) catch { | ||
case _: java.io.FileNotFoundException => | ||
fs.mkdirs(new Path(metadataPath)) | ||
Seq.empty | ||
} | ||
} | ||
|
||
private def fetchAllFiles(): Seq[String] = { | ||
fs.listStatus(new Path(path)) | ||
.filterNot(_.getPath.getName.startsWith("_")) | ||
.map(_.getPath.toUri.toString) | ||
} | ||
|
||
private def writeBatch(id: Int, files: Seq[String]): Unit = { | ||
val path = new Path(metadataPath + "/" + id) | ||
val fs = FileSystem.get(sparkContext.hadoopConfiguration) | ||
val writer = new BufferedWriter(new OutputStreamWriter(fs.create(path, true))) | ||
files.foreach { file => | ||
writer.write(file) | ||
writer.write("\n") | ||
} | ||
writer.close() | ||
logDebug(s"Wrote batch file $path") | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,8 +33,8 @@ object LastOptions { | |
class DefaultSource extends StreamSourceProvider with StreamSinkProvider { | ||
override def createSource( | ||
sqlContext: SQLContext, | ||
parameters: Map[String, String], | ||
schema: Option[StructType]): Source = { | ||
schema: Option[StructType], | ||
parameters: Map[String, String]): Source = { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we have to add stuff we should probably add a schema too. |
||
LastOptions.parameters = parameters | ||
LastOptions.schema = schema | ||
new Source { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Even if there are no files present, we should probably still defer to the source. Those that can support that will work and those that don't will throw the correct error message.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But we need to return some
StructType
here. Any magic to defer that?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh I see, even
sqlContext.read.format("text").load()
doesn't work? I would rather fix that than hardcode this here. For sources like parquet/json it doesn't really make sense to let them point at an empty directory, so I would rather throw an error.