[SPARK-27588] Binary file data source fails fast and doesn't attempt to read very large files #24483
BinaryFileFormat.scala:

@@ -24,11 +24,13 @@ import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileStatus, GlobFilter, Path}
 import org.apache.hadoop.mapreduce.Job
 
+import org.apache.spark.SparkException
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter
 import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils}
 import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory, PartitionedFile}
+import org.apache.spark.sql.internal.SQLConf.SOURCES_BINARY_FILE_MAX_LENGTH
 import org.apache.spark.sql.sources.{And, DataSourceRegister, EqualTo, Filter, GreaterThan, GreaterThanOrEqual, LessThan, LessThanOrEqual, Not, Or}
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
@@ -99,6 +101,7 @@ class BinaryFileFormat extends FileFormat with DataSourceRegister {
     val binaryFileSourceOptions = new BinaryFileSourceOptions(options)
     val pathGlobPattern = binaryFileSourceOptions.pathGlobFilter
     val filterFuncs = filters.map(filter => createFilterFunction(filter))
+    val maxLength = sparkSession.conf.get(SOURCES_BINARY_FILE_MAX_LENGTH)
 
     (file: PartitionedFile) => {
       val path = new Path(file.filePath)

Review comment (on the new maxLength line): Nit: we can define a method in [...]
Reply: [...]? The logic is not general enough to be applied outside binary data source.
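For context on that nit, here is a minimal sketch of what a conf-lookup helper might look like; the method name and its placement are hypothetical, since the location the reviewer referenced is lost from the comment:

    // Hypothetical helper; the PR keeps the lookup inline because, per the
    // reply above, the logic is not general enough to live outside this source.
    private def maxLength(sparkSession: SparkSession): Int =
      sparkSession.conf.get(SOURCES_BINARY_FILE_MAX_LENGTH)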
@@ -115,6 +118,11 @@ class BinaryFileFormat extends FileFormat with DataSourceRegister {
           case (MODIFICATION_TIME, i) =>
             writer.write(i, DateTimeUtils.fromMillis(status.getModificationTime))
           case (CONTENT, i) =>
+            if (status.getLen > maxLength) {
+              throw new SparkException(
+                s"The length of ${status.getPath} is ${status.getLen}, " +
+                  s"which exceeds the max length allowed: ${maxLength}.")
+            }
             val stream = fs.open(status.getPath)
             try {
               writer.write(i, ByteStreams.toByteArray(stream))

Review comment (on the new length check): I think we can move this to line 113.
Reply: I don't get it. The conf is to prevent reading very large files that we are sure will fail. Users can still use the data source if they don't need [...]
Reply: I see. I am actually OK with either way.
BinaryFileFormatSuite.scala:

@@ -27,10 +27,12 @@ import com.google.common.io.{ByteStreams, Closeables}
 import org.apache.hadoop.fs.{FileStatus, FileSystem, GlobFilter, Path}
 import org.mockito.Mockito.{mock, when}
 
-import org.apache.spark.sql.{QueryTest, Row}
+import org.apache.spark.SparkException
+import org.apache.spark.sql.{DataFrame, QueryTest, Row}
 import org.apache.spark.sql.catalyst.encoders.RowEncoder
 import org.apache.spark.sql.execution.datasources.PartitionedFile
 import org.apache.spark.sql.functions.col
+import org.apache.spark.sql.internal.SQLConf.{CONF_SOURCES_BINARY_FILE_MAX_LENGTH, SOURCES_BINARY_FILE_MAX_LENGTH}
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.test.{SharedSQLContext, SQLTestUtils}
 import org.apache.spark.sql.types._
@@ -339,4 +341,31 @@ class BinaryFileFormatSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
     assert(df.select("LENGTH").first().getLong(0) === content.length,
       "column pruning should be case insensitive")
   }
+
+  test("fail fast and do not attempt to read if a file is too big") {
+    assert(spark.conf.get(SOURCES_BINARY_FILE_MAX_LENGTH) === Int.MaxValue)
+    withTempPath { file =>
+      val path = file.getPath
+      val content = "123".getBytes
+      Files.write(file.toPath, content, StandardOpenOption.CREATE, StandardOpenOption.WRITE)
+      def readContent(): DataFrame = {
+        spark.read.format(BINARY_FILE)
+          .load(path)
+          .select(CONTENT)
+      }
+      val expected = Seq(Row(content))
+      QueryTest.checkAnswer(readContent(), expected)
+      withSQLConf(CONF_SOURCES_BINARY_FILE_MAX_LENGTH -> content.length.toString) {
+        QueryTest.checkAnswer(readContent(), expected)
+      }
+      // Disable read. If the implementation attempts to read, the exception would be different.
+      file.setReadable(false)
+      val caught = intercept[SparkException] {
+        withSQLConf(CONF_SOURCES_BINARY_FILE_MAX_LENGTH -> (content.length - 1).toString) {
+          QueryTest.checkAnswer(readContent(), expected)
+        }
+      }
+      assert(caught.getMessage.contains("exceeds the max length allowed"))
+    }
+  }
 }

Review comment (on file.setReadable(false)): Seems the test can still pass without this line. Maybe we can remove it?
Reply: If we still set the max to [...]
Review comment (on CONF_SOURCES_BINARY_FILE_MAX_LENGTH): Nit: I think we can follow the other SQLConf entries here by putting the conf key into buildConf without assigning it to a variable. Also, we can remove the private[sql]. We can set the key with SQLConf.SOURCES_BINARY_FILE_MAX_LENGTH.key.
Reply: Done.
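For reference, a sketch of the suggested shape in SQLConf, using Spark's ConfigBuilder API; the key string and doc text here are assumptions inferred from the constant name and error message, not quoted from the PR:

    // Hypothetical buildConf-style entry: no separate private[sql] key variable;
    // tests reference SQLConf.SOURCES_BINARY_FILE_MAX_LENGTH.key instead.
    val SOURCES_BINARY_FILE_MAX_LENGTH = buildConf("spark.sql.sources.binaryFile.maxLength")
      .doc("The max length of a file that can be read by the binary file data source. " +
        "Spark fails fast and does not attempt to read a file whose length exceeds this value.")
      .intConf
      .createWithDefault(Int.MaxValue)

With this shape, the withSQLConf calls in the test would use SQLConf.SOURCES_BINARY_FILE_MAX_LENGTH.key in place of the CONF_SOURCES_BINARY_FILE_MAX_LENGTH variable, which is the cleanup the reviewer asked for and the "Done." reply confirms.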