Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 5 additions & 10 deletions datafusion/core/tests/datasource/object_store_access.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,7 @@ async fn query_multi_csv_file() {
+---------+-------+-------+
------- Object Store Request Summary -------
RequestCountingObjectStore()
Total Requests: 4
- LIST prefix=data
Total Requests: 3
- GET (opts) path=data/file_0.csv
- GET (opts) path=data/file_1.csv
- GET (opts) path=data/file_2.csv
Expand All @@ -145,8 +144,7 @@ async fn query_partitioned_csv_file() {
+---------+-------+-------+---+----+-----+
------- Object Store Request Summary -------
RequestCountingObjectStore()
Total Requests: 4
- LIST prefix=data
Total Requests: 3
- GET (opts) path=data/a=1/b=10/c=100/file_1.csv
- GET (opts) path=data/a=2/b=20/c=200/file_2.csv
- GET (opts) path=data/a=3/b=30/c=300/file_3.csv
Expand Down Expand Up @@ -183,8 +181,7 @@ async fn query_partitioned_csv_file() {
+---------+-------+-------+---+----+-----+
------- Object Store Request Summary -------
RequestCountingObjectStore()
Total Requests: 2
- LIST prefix=data
Total Requests: 1
- GET (opts) path=data/a=2/b=20/c=200/file_2.csv
"
);
Expand All @@ -201,8 +198,7 @@ async fn query_partitioned_csv_file() {
+---------+-------+-------+---+----+-----+
------- Object Store Request Summary -------
RequestCountingObjectStore()
Total Requests: 2
- LIST prefix=data
Total Requests: 1
- GET (opts) path=data/a=2/b=20/c=200/file_2.csv
"
);
Expand Down Expand Up @@ -237,8 +233,7 @@ async fn query_partitioned_csv_file() {
+---------+-------+-------+---+----+-----+
------- Object Store Request Summary -------
RequestCountingObjectStore()
Total Requests: 2
- LIST prefix=data
Total Requests: 1
- GET (opts) path=data/a=1/b=10/c=100/file_1.csv
"
);
Expand Down
95 changes: 80 additions & 15 deletions datafusion/execution/src/cache/cache_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ use std::any::Any;
use std::collections::HashMap;
use std::fmt::{Debug, Formatter};
use std::sync::Arc;
use std::time::Duration;

use super::cache_unit::{
DefaultListFilesCache, DEFAULT_LIST_FILES_CACHE_LIMIT, DEFAULT_LIST_FILES_CACHE_TTL,
};

/// A cache for [`Statistics`].
///
Expand All @@ -42,8 +47,18 @@ pub type FileStatisticsCache =
/// especially when done over remote object stores.
///
/// See [`crate::runtime_env::RuntimeEnv`] for more details
pub type ListFilesCache =
Arc<dyn CacheAccessor<Path, Arc<Vec<ObjectMeta>>, Extra = ObjectMeta>>;
pub trait ListFilesCache:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think one of the things I would like to do in the Cache Manager caches is to segregate the cache eviction policy. Personally I think the user should be given an option on what is the eviction behaviour they want. wdyt @alamb @BlakeOrth ? I can work on getting some draft out this weekend on it.

CacheAccessor<Path, Arc<Vec<ObjectMeta>>, Extra = ObjectMeta>
{
// Returns the cache's object limit.
fn cache_limit(&self) -> usize;

// Returns the cache's object ttl.
fn cache_ttl(&self) -> Duration;

// Updates the cache with a new object limit.
fn update_cache_limit(&self, limit: usize);
}

/// Generic file-embedded metadata used with [`FileMetadataCache`].
///
Expand Down Expand Up @@ -109,7 +124,7 @@ impl Debug for dyn CacheAccessor<Path, Arc<Statistics>, Extra = ObjectMeta> {
}
}

impl Debug for dyn CacheAccessor<Path, Arc<Vec<ObjectMeta>>, Extra = ObjectMeta> {
impl Debug for dyn ListFilesCache {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
write!(f, "Cache name: {} with length: {}", self.name(), self.len())
}
Expand All @@ -131,7 +146,7 @@ impl Debug for dyn FileMetadataCache {
#[derive(Debug)]
pub struct CacheManager {
file_statistic_cache: Option<FileStatisticsCache>,
list_files_cache: Option<ListFilesCache>,
list_files_cache: Option<Arc<dyn ListFilesCache>>,
file_metadata_cache: Arc<dyn FileMetadataCache>,
}

Expand All @@ -140,7 +155,17 @@ impl CacheManager {
let file_statistic_cache =
config.table_files_statistics_cache.as_ref().map(Arc::clone);

let list_files_cache = config.list_files_cache.as_ref().map(Arc::clone);
let list_files_cache = config
.list_files_cache
.as_ref()
.map(Arc::clone)
.unwrap_or_else(|| {
Arc::new(DefaultListFilesCache::new(
// TODO: config
512 * 1024,
Duration::new(600, 0),
Comment on lines +164 to +166
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This POC doesn't implement any of the user configuration. This seems like a good opportunity to divide the work on this effort! We could get the base DefaultListFilesCache approved for merge without user configuration, and leave it disabled, and user configuration could be added by anyone who wants to contribute.

))
});

let file_metadata_cache = config
.file_metadata_cache
Expand All @@ -155,7 +180,7 @@ impl CacheManager {

Ok(Arc::new(CacheManager {
file_statistic_cache,
list_files_cache,
list_files_cache: Some(list_files_cache), // TODO: reinstate optionality
file_metadata_cache,
}))
}
Expand All @@ -166,10 +191,24 @@ impl CacheManager {
}

/// Get the cache for storing the result of listing [`ObjectMeta`]s under the same path.
pub fn get_list_files_cache(&self) -> Option<ListFilesCache> {
pub fn get_list_files_cache(&self) -> Option<Arc<dyn ListFilesCache>> {
self.list_files_cache.clone()
}

/// Get the object limit of the list files cache.
///
/// Returns the value reported by the configured cache's `cache_limit()`, or
/// [`DEFAULT_LIST_FILES_CACHE_LIMIT`] when no list files cache is set.
pub fn get_list_files_cache_limit(&self) -> usize {
self.list_files_cache
.as_ref()
.map_or(DEFAULT_LIST_FILES_CACHE_LIMIT, |c| c.cache_limit())
}

/// Get the time-to-live (TTL) of entries in the list files cache.
///
/// Returns the value reported by the configured cache's `cache_ttl()`, or
/// [`DEFAULT_LIST_FILES_CACHE_TTL`] when no list files cache is set.
pub fn get_list_files_cache_ttl(&self) -> Duration {
self.list_files_cache
.as_ref()
.map_or(DEFAULT_LIST_FILES_CACHE_TTL, |c| c.cache_ttl())
}

/// Get the file embedded metadata cache.
pub fn get_file_metadata_cache(&self) -> Arc<dyn FileMetadataCache> {
Arc::clone(&self.file_metadata_cache)
Expand All @@ -189,13 +228,20 @@ pub struct CacheManagerConfig {
/// Avoid get same file statistics repeatedly in same datafusion session.
/// Default is disabled. For now only supports Parquet files.
pub table_files_statistics_cache: Option<FileStatisticsCache>,
/// Enable cache of file metadata when listing files.
/// This setting avoids listing file meta of the same path repeatedly
/// in same session, which may be expensive in certain situations (e.g. remote object storage).
/// Enable caching of file metadata when listing files.
/// Enabling the cache avoids repeat list and metadata fetch operations, which may be expensive
/// in certain situations (e.g. remote object storage), for objects under paths that are
/// cached.
/// Note that if this option is enabled, DataFusion will not see any updates to the underlying
/// location.
/// Default is disable.
pub list_files_cache: Option<ListFilesCache>,
/// storage for at least `list_files_cache_ttl` duration.
/// Default is disabled.
pub list_files_cache: Option<Arc<dyn ListFilesCache>>,
/// Limit the number of objects to keep in the `list_files_cache`. Default: ~125k objects
pub list_files_cache_limit: usize,
/// The duration the list files cache will consider an entry valid after insertion. Note that
/// changes to the underlying storage system, such as adding or removing data, will not be
/// visible until an entry expires. Default: 10 minutes.
pub list_files_cache_ttl: Duration,
/// Cache of file-embedded metadata, used to avoid reading it multiple times when processing a
/// data file (e.g., Parquet footer and page metadata).
/// If not provided, the [`CacheManager`] will create a [`DefaultFilesMetadataCache`].
Expand All @@ -209,6 +255,8 @@ impl Default for CacheManagerConfig {
Self {
table_files_statistics_cache: Default::default(),
list_files_cache: Default::default(),
list_files_cache_limit: DEFAULT_LIST_FILES_CACHE_LIMIT,
list_files_cache_ttl: DEFAULT_LIST_FILES_CACHE_TTL,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some use cases don't need a TTL; we should provide a way to keep it disabled as well.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, thinking about it more, I understand why you have kept it. I feel it diverges from the metadata cache and could confuse end users somewhat.

file_metadata_cache: Default::default(),
metadata_cache_limit: DEFAULT_METADATA_CACHE_LIMIT,
}
Expand All @@ -228,13 +276,30 @@ impl CacheManagerConfig {
}

/// Set the cache for listing files.
///
///
/// Default is `None` (disabled).
pub fn with_list_files_cache(mut self, cache: Option<ListFilesCache>) -> Self {
pub fn with_list_files_cache(
mut self,
cache: Option<Arc<dyn ListFilesCache>>,
) -> Self {
self.list_files_cache = cache;
self
}

/// Sets the limit on the number of objects to keep in the list files cache.
///
/// The value is stored in `list_files_cache_limit` and the updated config is
/// returned so calls can be chained.
///
/// NOTE(review): the `CacheManager` construction visible in this change still
/// hard-codes the limit passed to `DefaultListFilesCache::new` (marked
/// `TODO: config`), so this setting may not take effect yet; confirm.
pub fn with_list_files_cache_limit(mut self, limit: usize) -> Self {
self.list_files_cache_limit = limit;
self
}

/// Sets the duration for which a list files cache entry is considered valid.
///
/// The value is stored in `list_files_cache_ttl`. If `ttl` is zero, any
/// configured `list_files_cache` is also cleared (set to `None`, i.e.
/// disabled). The cache is cleared only at the time of this call, so a later
/// call to `with_list_files_cache` can install a cache again.
///
/// Returns the updated config so calls can be chained.
pub fn with_list_files_cache_ttl(mut self, ttl: Duration) -> Self {
self.list_files_cache_ttl = ttl;
// A zero TTL leaves no window in which an entry is valid, so drop the cache.
if ttl.is_zero() {
self.list_files_cache = None
}

self
}

/// Sets the cache for file-embedded metadata.
///
/// Default is a [`DefaultFilesMetadataCache`].
Expand Down
Loading
Loading