Skip to content

Commit

Permalink
[SPARK-32368][SQL] pathGlobFilter, recursiveFileLookup and basePath s…
Browse files Browse the repository at this point in the history
…hould respect case insensitivity

### What changes were proposed in this pull request?

This PR proposes to make the datasource options at `PartitioningAwareFileIndex` respect case insensitivity consistently:
- `pathGlobFilter`
- `recursiveFileLookup `
- `basePath`

### Why are the changes needed?

To support consistent case insensitivity in datasource options.

### Does this PR introduce _any_ user-facing change?

Yes, now users can also use case insensitive options such as `PathglobFilter`.

### How was this patch tested?

Unittest were added. It reuses existing tests and adds extra clues to make it easier to track when the test is broken.

Closes #29165 from HyukjinKwon/SPARK-32368.

Authored-by: HyukjinKwon <gurwls223@apache.org>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
  • Loading branch information
HyukjinKwon authored and dongjoon-hyun committed Jul 20, 2020
1 parent ffdca82 commit 133c5ed
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 18 deletions.
Expand Up @@ -56,14 +56,17 @@ abstract class PartitioningAwareFileIndex(

protected def leafDirToChildrenFiles: Map[Path, Array[FileStatus]]

protected lazy val pathGlobFilter = parameters.get("pathGlobFilter").map(new GlobFilter(_))
private val caseInsensitiveMap = CaseInsensitiveMap(parameters)

protected lazy val pathGlobFilter: Option[GlobFilter] =
caseInsensitiveMap.get("pathGlobFilter").map(new GlobFilter(_))

protected def matchGlobPattern(file: FileStatus): Boolean = {
pathGlobFilter.forall(_.accept(file.getPath))
}

protected lazy val recursiveFileLookup = {
parameters.getOrElse("recursiveFileLookup", "false").toBoolean
protected lazy val recursiveFileLookup: Boolean = {
caseInsensitiveMap.getOrElse("recursiveFileLookup", "false").toBoolean
}

override def listFiles(
Expand Down Expand Up @@ -215,7 +218,7 @@ abstract class PartitioningAwareFileIndex(
* and the returned DataFrame will have the column of `something`.
*/
private def basePaths: Set[Path] = {
parameters.get(BASE_PATH_PARAM).map(new Path(_)) match {
caseInsensitiveMap.get(BASE_PATH_PARAM).map(new Path(_)) match {
case Some(userDefinedBasePath) =>
val fs = userDefinedBasePath.getFileSystem(hadoopConf)
if (!fs.isDirectory(userDefinedBasePath)) {
Expand Down
Expand Up @@ -633,13 +633,15 @@ class FileBasedDataSourceSuite extends QueryTest

assert(fileList.toSet === expectedFileList.toSet)

val fileList2 = spark.read.format("binaryFile")
.option("recursiveFileLookup", true)
.option("pathGlobFilter", "*.bin")
.load(dataPath)
.select("path").collect().map(_.getString(0))

assert(fileList2.toSet === expectedFileList.filter(_.endsWith(".bin")).toSet)
withClue("SPARK-32368: 'recursiveFileLookup' and 'pathGlobFilter' can be case insensitive") {
val fileList2 = spark.read.format("binaryFile")
.option("RecuRsivefileLookup", true)
.option("PaThglobFilter", "*.bin")
.load(dataPath)
.select("path").collect().map(_.getString(0))

assert(fileList2.toSet === expectedFileList.filter(_.endsWith(".bin")).toSet)
}
}
}

Expand Down
Expand Up @@ -367,13 +367,15 @@ class FileIndexSuite extends SharedSparkSession {
val wrongBasePath = new File(dir, "unknown")
// basePath must be a directory
wrongBasePath.mkdir()
val parameters = Map("basePath" -> wrongBasePath.getCanonicalPath)
val fileIndex = new InMemoryFileIndex(spark, Seq(path), parameters, None)
val msg = intercept[IllegalArgumentException] {
// trigger inferPartitioning()
fileIndex.partitionSpec()
}.getMessage
assert(msg === s"Wrong basePath ${wrongBasePath.getCanonicalPath} for the root path: $path")
withClue("SPARK-32368: 'basePath' can be case insensitive") {
val parameters = Map("bAsepAtH" -> wrongBasePath.getCanonicalPath)
val fileIndex = new InMemoryFileIndex(spark, Seq(path), parameters, None)
val msg = intercept[IllegalArgumentException] {
// trigger inferPartitioning()
fileIndex.partitionSpec()
}.getMessage
assert(msg === s"Wrong basePath ${wrongBasePath.getCanonicalPath} for the root path: $path")
}
}
}

Expand Down

0 comments on commit 133c5ed

Please sign in to comment.