diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystConf.scala
index ed979d51568c8..35884139b6be8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystConf.scala
@@ -22,6 +22,10 @@ import org.apache.spark.sql.catalyst.analysis._
 private[spark] trait CatalystConf {
   def caseSensitiveAnalysis: Boolean
 
+  /**
+   * Returns the [[Resolver]] for the current configuration, which can be used to determine if two
+   * identifiers are equal.
+   */
   def resolver: Resolver = {
     if (caseSensitiveAnalysis) {
       caseSensitiveResolution
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala
index ce1e2ffd9fccf..e2cbbc34d91a4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala
@@ -38,6 +38,7 @@ case class PartitionedFile(
  * directories.
  *
  * IMPLEMENT ME: This is just a placeholder for a future implementation.
+ * TODO: This currently does not take locality information about the files into account.
  */
 case class FilePartition(val index: Int, files: Seq[PartitionedFile]) extends Partition
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala
index 1a3036ce64950..f548ef13e0e02 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala
@@ -42,11 +42,11 @@ import org.apache.spark.sql.types._
  *    well.
  *  - Construct a reader function by passing filters and the schema into the FileFormat.
  *  - Using an partition pruning predicates, enumerate the list of files that should be read.
- *  - Split the files into tasks and construct a FileScanRDD
+ *  - Split the files into tasks and construct a FileScanRDD.
  *  - Add any projection or filters that must be evaluated after the scan.
  *
  * Files are assigned into tasks using the following algorithm:
- *  - If the table is bucketed: group files by bucket id into the correct number of partitions.
+ *  - If the table is bucketed, group files by bucket id into the correct number of partitions.
  *  - If the table is not bucketed or bucketing is turned off:
  *    - If any file is larger than the threshold, split it into pieces based on that threshold
  *    - Sort the files by decreasing file size.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
index fd89282ed430f..81e6e4e5b6bf6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -498,6 +498,15 @@ trait FileCatalog {
 
   def partitionSpec(): PartitionSpec
 
+  /**
+   * Returns all valid files grouped into partitions when the data is partitioned. If the data is
+   * unpartitioned, this will return a single partition with no partition values.
+   *
+   * @param filters the filters used to prune which partitions are returned. These filters must
+   *                only refer to partition columns and this method will only return files
+   *                where these predicates are guaranteed to evaluate to `true`. Thus, these
+   *                filters will not need to be evaluated again on the returned data.
+   */
   def listFiles(filters: Seq[Expression]): Seq[Partition]
 
   def allFiles(): Seq[FileStatus]
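To make the listFiles contract above concrete, the following is a minimal, self-contained Scala sketch; ToyFile, ToyPartition, and the predicate representation are hypothetical stand-ins, not Spark's FileCatalog, Expression, or Partition classes. It illustrates partition-level pruning: because the filters refer only to partition values, every file inside a surviving partition is guaranteed to satisfy them, so they do not need to be re-evaluated on the returned data.

// Minimal sketch of the listFiles contract (toy types, not Spark's actual API).
object PartitionPruningSketch {
  case class ToyFile(path: String)
  case class ToyPartition(values: Map[String, String], files: Seq[ToyFile])

  // Analogue of FileCatalog.listFiles: keep only partitions whose values pass every filter.
  def listFiles(
      partitions: Seq[ToyPartition],
      filters: Seq[Map[String, String] => Boolean]): Seq[ToyPartition] =
    partitions.filter(p => filters.forall(f => f(p.values)))

  def main(args: Array[String]): Unit = {
    val catalog = Seq(
      ToyPartition(Map("date" -> "2016-03-01"), Seq(ToyFile("/data/date=2016-03-01/part-00000"))),
      ToyPartition(Map("date" -> "2016-03-02"), Seq(ToyFile("/data/date=2016-03-02/part-00000"))))

    // The filter touches only the partition column, mirroring the documented contract.
    val pruned = listFiles(catalog, Seq(values => values("date") == "2016-03-02"))
    pruned.flatMap(_.files).foreach(f => println(f.path))  // /data/date=2016-03-02/part-00000
  }
}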