-
Notifications
You must be signed in to change notification settings - Fork 1.8k
POC for DefaultListFilesCache #18855
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,6 +24,11 @@ use std::any::Any; | |
| use std::collections::HashMap; | ||
| use std::fmt::{Debug, Formatter}; | ||
| use std::sync::Arc; | ||
| use std::time::Duration; | ||
|
|
||
| use super::cache_unit::{ | ||
| DefaultListFilesCache, DEFAULT_LIST_FILES_CACHE_LIMIT, DEFAULT_LIST_FILES_CACHE_TTL, | ||
| }; | ||
|
|
||
| /// A cache for [`Statistics`]. | ||
| /// | ||
|
|
@@ -42,8 +47,18 @@ pub type FileStatisticsCache = | |
| /// especially when done over remote object stores. | ||
| /// | ||
| /// See [`crate::runtime_env::RuntimeEnv`] for more details | ||
| pub type ListFilesCache = | ||
| Arc<dyn CacheAccessor<Path, Arc<Vec<ObjectMeta>>, Extra = ObjectMeta>>; | ||
| pub trait ListFilesCache: | ||
| CacheAccessor<Path, Arc<Vec<ObjectMeta>>, Extra = ObjectMeta> | ||
| { | ||
| // Returns the cache's object limit. | ||
| fn cache_limit(&self) -> usize; | ||
|
|
||
| // Returns the cache's object ttl. | ||
| fn cache_ttl(&self) -> Duration; | ||
|
|
||
| // Updates the cache with a new boject limit. | ||
| fn update_cache_limit(&self, limit: usize); | ||
| } | ||
|
|
||
| /// Generic file-embedded metadata used with [`FileMetadataCache`]. | ||
| /// | ||
|
|
@@ -109,7 +124,7 @@ impl Debug for dyn CacheAccessor<Path, Arc<Statistics>, Extra = ObjectMeta> { | |
| } | ||
| } | ||
|
|
||
| impl Debug for dyn CacheAccessor<Path, Arc<Vec<ObjectMeta>>, Extra = ObjectMeta> { | ||
| impl Debug for dyn ListFilesCache { | ||
| fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { | ||
| write!(f, "Cache name: {} with length: {}", self.name(), self.len()) | ||
| } | ||
|
|
@@ -131,7 +146,7 @@ impl Debug for dyn FileMetadataCache { | |
| #[derive(Debug)] | ||
| pub struct CacheManager { | ||
| file_statistic_cache: Option<FileStatisticsCache>, | ||
| list_files_cache: Option<ListFilesCache>, | ||
| list_files_cache: Option<Arc<dyn ListFilesCache>>, | ||
| file_metadata_cache: Arc<dyn FileMetadataCache>, | ||
| } | ||
|
|
||
|
|
@@ -140,7 +155,17 @@ impl CacheManager { | |
| let file_statistic_cache = | ||
| config.table_files_statistics_cache.as_ref().map(Arc::clone); | ||
|
|
||
| let list_files_cache = config.list_files_cache.as_ref().map(Arc::clone); | ||
| let list_files_cache = config | ||
| .list_files_cache | ||
| .as_ref() | ||
| .map(Arc::clone) | ||
| .unwrap_or_else(|| { | ||
| Arc::new(DefaultListFilesCache::new( | ||
| // TODO: config | ||
| 512 * 1024, | ||
| Duration::new(600, 0), | ||
|
Comment on lines
+164
to
+166
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This POC doesn't implement any of the user configuration. This seems like a good opportunity to divide the work on this effort! We could get the base |
||
| )) | ||
| }); | ||
|
|
||
| let file_metadata_cache = config | ||
| .file_metadata_cache | ||
|
|
@@ -155,7 +180,7 @@ impl CacheManager { | |
|
|
||
| Ok(Arc::new(CacheManager { | ||
| file_statistic_cache, | ||
| list_files_cache, | ||
| list_files_cache: Some(list_files_cache), // TODO: reinstate optionality | ||
| file_metadata_cache, | ||
| })) | ||
| } | ||
|
|
@@ -166,10 +191,24 @@ impl CacheManager { | |
| } | ||
|
|
||
| /// Get the cache for storing the result of listing [`ObjectMeta`]s under the same path. | ||
| pub fn get_list_files_cache(&self) -> Option<ListFilesCache> { | ||
| pub fn get_list_files_cache(&self) -> Option<Arc<dyn ListFilesCache>> { | ||
| self.list_files_cache.clone() | ||
| } | ||
|
|
||
| /// Get the limit of the file embedded metadata cache. | ||
| pub fn get_list_files_cache_limit(&self) -> usize { | ||
| self.list_files_cache | ||
| .as_ref() | ||
| .map_or(DEFAULT_LIST_FILES_CACHE_LIMIT, |c| c.cache_limit()) | ||
| } | ||
|
|
||
| /// Get the limit of the file embedded metadata cache. | ||
| pub fn get_list_files_cache_ttl(&self) -> Duration { | ||
| self.list_files_cache | ||
| .as_ref() | ||
| .map_or(DEFAULT_LIST_FILES_CACHE_TTL, |c| c.cache_ttl()) | ||
| } | ||
|
|
||
| /// Get the file embedded metadata cache. | ||
| pub fn get_file_metadata_cache(&self) -> Arc<dyn FileMetadataCache> { | ||
| Arc::clone(&self.file_metadata_cache) | ||
|
|
@@ -189,13 +228,20 @@ pub struct CacheManagerConfig { | |
| /// Avoid get same file statistics repeatedly in same datafusion session. | ||
| /// Default is disable. Fow now only supports Parquet files. | ||
| pub table_files_statistics_cache: Option<FileStatisticsCache>, | ||
| /// Enable cache of file metadata when listing files. | ||
| /// This setting avoids listing file meta of the same path repeatedly | ||
| /// in same session, which may be expensive in certain situations (e.g. remote object storage). | ||
| /// Enable caching of file metadata when listing files. | ||
| /// Enabling the cache avoids repeat list and metadata fetch operations, which may be expensive | ||
| /// in certain situations (e.g. remote object storage), for objects under paths that are | ||
| /// cached. | ||
| /// Note that if this option is enabled, DataFusion will not see any updates to the underlying | ||
| /// location. | ||
| /// Default is disable. | ||
| pub list_files_cache: Option<ListFilesCache>, | ||
| /// storage for at least `list_files_cache_ttl` duration. | ||
| /// Default is disabled. | ||
| pub list_files_cache: Option<Arc<dyn ListFilesCache>>, | ||
| /// Limit the number of objects to keep in the `list_files_cache`. Default: ~125k objects | ||
| pub list_files_cache_limit: usize, | ||
| /// The duration the list files cache will consider an entry valid after insertion. Note that | ||
| /// changes to the underlying storage system, such as adding or removing data, will not be | ||
| /// visible until an entry expires. Default: 10 minutes. | ||
| pub list_files_cache_ttl: Duration, | ||
| /// Cache of file-embedded metadata, used to avoid reading it multiple times when processing a | ||
| /// data file (e.g., Parquet footer and page metadata). | ||
| /// If not provided, the [`CacheManager`] will create a [`DefaultFilesMetadataCache`]. | ||
|
|
@@ -209,6 +255,8 @@ impl Default for CacheManagerConfig { | |
| Self { | ||
| table_files_statistics_cache: Default::default(), | ||
| list_files_cache: Default::default(), | ||
| list_files_cache_limit: DEFAULT_LIST_FILES_CACHE_LIMIT, | ||
| list_files_cache_ttl: DEFAULT_LIST_FILES_CACHE_TTL, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Some usecases don't need a TTL, we should provide a way to keep that disable as well.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Well thinking more, I understand why you have kept it.. I feel it diverges from the metadata cache and could confuse the end users somewhat |
||
| file_metadata_cache: Default::default(), | ||
| metadata_cache_limit: DEFAULT_METADATA_CACHE_LIMIT, | ||
| } | ||
|
|
@@ -228,13 +276,30 @@ impl CacheManagerConfig { | |
| } | ||
|
|
||
| /// Set the cache for listing files. | ||
| /// | ||
| /// | ||
| /// Default is `None` (disabled). | ||
| pub fn with_list_files_cache(mut self, cache: Option<ListFilesCache>) -> Self { | ||
| pub fn with_list_files_cache( | ||
| mut self, | ||
| cache: Option<Arc<dyn ListFilesCache>>, | ||
| ) -> Self { | ||
| self.list_files_cache = cache; | ||
| self | ||
| } | ||
|
|
||
| pub fn with_list_files_cache_limit(mut self, limit: usize) -> Self { | ||
| self.list_files_cache_limit = limit; | ||
| self | ||
| } | ||
|
|
||
| pub fn with_list_files_cache_ttl(mut self, ttl: Duration) -> Self { | ||
| self.list_files_cache_ttl = ttl; | ||
| if ttl.is_zero() { | ||
| self.list_files_cache = None | ||
| } | ||
|
|
||
| self | ||
| } | ||
|
|
||
| /// Sets the cache for file-embedded metadata. | ||
| /// | ||
| /// Default is a [`DefaultFilesMetadataCache`]. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think one of the things I would like to do in the Cache Manager caches is to segregate the cache eviction policy. Personally I think the user should be given an option on what is the eviction behaviour they want. wdyt @alamb @BlakeOrth ? I can work on getting some draft out this weekend on it.