Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions datafusion/core/src/execution/runtime_env.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,15 @@ impl RuntimeEnv {
self.memory_manager.shrink_tracker_usage(delta)
}

/// Registers an object store with scheme using a custom `ObjectStore` so that
/// an external file system or object storage system could be used against this context.
/// Registers a custom `ObjectStore` to be used when accessing a
/// specific scheme and host. This allows DataFusion to create
/// external tables from urls that do not have built in support
/// such as `hdfs://...`.
///
/// Returns the `ObjectStore` previously registered for this scheme, if any
/// Returns the [`ObjectStore`] previously registered for this
/// scheme, if any.
///
/// See [`ObjectStoreRegistry`] for more details
pub fn register_object_store(
&self,
scheme: impl AsRef<str>,
Expand All @@ -115,7 +120,9 @@ impl RuntimeEnv {
self.table_factories.extend(table_factories)
}

/// Retrieves a `ObjectStore` instance for a url
/// Retrieves a `ObjectStore` instance for a url by consulting the
/// registery. See [`ObjectStoreRegistry::get_by_url`] for more
/// details.
pub fn object_store(&self, url: impl AsRef<Url>) -> Result<Arc<dyn ObjectStore>> {
self.object_store_registry
.get_by_url(url)
Expand Down
19 changes: 15 additions & 4 deletions datafusion/core/src/physical_plan/file_format/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,19 +74,30 @@ lazy_static! {
/// any given file format.
#[derive(Debug, Clone)]
pub struct FileScanConfig {
/// Object store URL
/// Object store URL, used to get an [`ObjectStore`] instance from
/// [`RuntimeEnv::object_store`]
pub object_store_url: ObjectStoreUrl,
/// Schema before projection. It contains the columns that are expected
/// to be in the files without the table partition columns.
/// Schema before `projection` is applied. It contains the all columns that may
/// appear in the files. It does not include table partition columns
/// that may be added.
pub file_schema: SchemaRef,
/// List of files to be processed, grouped into partitions
///
/// Each file must have a schema of `file_schema` or a subset. If
/// a particular file has a subset, the missing columns are
/// padded with with NULLs.
///
/// DataFusion may attempt to read each partition of files
/// concurrently, however files *within* a partition will be read
/// sequentially, one after the next.
pub file_groups: Vec<Vec<PartitionedFile>>,
/// Estimated overall statistics of the files, taking `filters` into account.
pub statistics: Statistics,
/// Columns on which to project the data. Indexes that are higher than the
/// number of columns of `file_schema` refer to `table_partition_cols`.
pub projection: Option<Vec<usize>>,
/// The minimum number of records required from this source plan
/// The maximum number of records to read from this plan. If None,
/// all records after filtering are returned.
pub limit: Option<usize>,
/// The partitioning column names
pub table_partition_cols: Vec<String>,
Expand Down