comments

apache · Mar 11, 2016 · 65596df · 65596df
1 parent 7ad3119
commit 65596df
Show file tree

Hide file tree

Showing 4 changed files with 16 additions and 2 deletions.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystConf.scala
@@ -22,6 +22,10 @@ import org.apache.spark.sql.catalyst.analysis._
 private[spark] trait CatalystConf {
   def caseSensitiveAnalysis: Boolean
 
+  /**
+   * Returns the [[Resolver]] for the current configuration, which can be used to determin if two
+   * identifiers are equal.
+   */
   def resolver: Resolver = {
     if (caseSensitiveAnalysis) {
       caseSensitiveResolution

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala
@@ -38,6 +38,7 @@ case class PartitionedFile(
  * directories.
  *
  * IMPLEMENT ME: This is just a placeholder for a future implementation.
+ * TODO: This currently does not take locality information about the files into account.
  */
 case class FilePartition(val index: Int, files: Seq[PartitionedFile]) extends Partition
 

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala
@@ -42,11 +42,11 @@ import org.apache.spark.sql.types._
  *    well.
  *  - Construct a reader function by passing filters and the schema into the FileFormat.
  *  - Using an partition pruning predicates, enumerate the list of files that should be read.
- *  - Split the files into tasks and construct a FileScanRDD
+ *  - Split the files into tasks and construct a FileScanRDD.
  *  - Add any projection or filters that must be evaluated after the scan.
  *
  * Files are assigned into tasks using the following algorithm:
- *  - If the table is bucketed: group files by bucket id into the correct number of partitions.
+ *  - If the table is bucketed, group files by bucket id into the correct number of partitions.
  *  - If the table is not bucketed or bucketing is turned off:
  *   - If any file is larger than the threshold, split it into pieces based on that threshold
  *   - Sort the files by decreasing file size.

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -498,6 +498,15 @@ trait FileCatalog {
 
   def partitionSpec(): PartitionSpec
 
+  /**
+   * Returns all valid files grouped into partitions when the data is partitioned. If the data is
+   * unpartitioned, this will return a single partition with not partition values.
+   *
+   * @param filters the filters used to prune which partitions are returned.  These filters must
+   *                only refer to partition columns and this method will only return files
+   *                where these predicates are guaranteed to evaluate to `true`.  Thus, these
+   *                filters will not need to be evaluated again on the returned data.
+   */
   def listFiles(filters: Seq[Expression]): Seq[Partition]
 
   def allFiles(): Seq[FileStatus]