diff --git a/datafusion/core/src/execution/runtime_env.rs b/datafusion/core/src/execution/runtime_env.rs index 72afd4327fb7..7c3e9b4e6df4 100644 --- a/datafusion/core/src/execution/runtime_env.rs +++ b/datafusion/core/src/execution/runtime_env.rs @@ -93,10 +93,15 @@ impl RuntimeEnv { self.memory_manager.shrink_tracker_usage(delta) } - /// Registers an object store with scheme using a custom `ObjectStore` so that - /// an external file system or object storage system could be used against this context. + /// Registers a custom `ObjectStore` to be used when accessing a + /// specific scheme and host. This allows DataFusion to create + /// external tables from urls that do not have built in support + /// such as `hdfs://...`. /// - /// Returns the `ObjectStore` previously registered for this scheme, if any + /// Returns the [`ObjectStore`] previously registered for this + /// scheme, if any. + /// + /// See [`ObjectStoreRegistry`] for more details pub fn register_object_store( &self, scheme: impl AsRef, @@ -115,7 +120,9 @@ impl RuntimeEnv { self.table_factories.extend(table_factories) } - /// Retrieves a `ObjectStore` instance for a url + /// Retrieves a `ObjectStore` instance for a url by consulting the + /// registery. See [`ObjectStoreRegistry::get_by_url`] for more + /// details. pub fn object_store(&self, url: impl AsRef) -> Result> { self.object_store_registry .get_by_url(url) diff --git a/datafusion/core/src/physical_plan/file_format/mod.rs b/datafusion/core/src/physical_plan/file_format/mod.rs index c33e2bc14701..dbf348b09cd5 100644 --- a/datafusion/core/src/physical_plan/file_format/mod.rs +++ b/datafusion/core/src/physical_plan/file_format/mod.rs @@ -74,19 +74,30 @@ lazy_static! { /// any given file format. #[derive(Debug, Clone)] pub struct FileScanConfig { - /// Object store URL + /// Object store URL, used to get an [`ObjectStore`] instance from + /// [`RuntimeEnv::object_store`] pub object_store_url: ObjectStoreUrl, - /// Schema before projection. It contains the columns that are expected - /// to be in the files without the table partition columns. + /// Schema before `projection` is applied. It contains the all columns that may + /// appear in the files. It does not include table partition columns + /// that may be added. pub file_schema: SchemaRef, /// List of files to be processed, grouped into partitions + /// + /// Each file must have a schema of `file_schema` or a subset. If + /// a particular file has a subset, the missing columns are + /// padded with with NULLs. + /// + /// DataFusion may attempt to read each partition of files + /// concurrently, however files *within* a partition will be read + /// sequentially, one after the next. pub file_groups: Vec>, /// Estimated overall statistics of the files, taking `filters` into account. pub statistics: Statistics, /// Columns on which to project the data. Indexes that are higher than the /// number of columns of `file_schema` refer to `table_partition_cols`. pub projection: Option>, - /// The minimum number of records required from this source plan + /// The maximum number of records to read from this plan. If None, + /// all records after filtering are returned. pub limit: Option, /// The partitioning column names pub table_partition_cols: Vec,