diff --git a/datafusion/core/tests/datasource/object_store_access.rs b/datafusion/core/tests/datasource/object_store_access.rs index 4a2b68fb11b0..87f6588dad71 100644 --- a/datafusion/core/tests/datasource/object_store_access.rs +++ b/datafusion/core/tests/datasource/object_store_access.rs @@ -117,8 +117,7 @@ async fn query_multi_csv_file() { +---------+-------+-------+ ------- Object Store Request Summary ------- RequestCountingObjectStore() - Total Requests: 4 - - LIST prefix=data + Total Requests: 3 - GET (opts) path=data/file_0.csv - GET (opts) path=data/file_1.csv - GET (opts) path=data/file_2.csv @@ -145,8 +144,7 @@ async fn query_partitioned_csv_file() { +---------+-------+-------+---+----+-----+ ------- Object Store Request Summary ------- RequestCountingObjectStore() - Total Requests: 4 - - LIST prefix=data + Total Requests: 3 - GET (opts) path=data/a=1/b=10/c=100/file_1.csv - GET (opts) path=data/a=2/b=20/c=200/file_2.csv - GET (opts) path=data/a=3/b=30/c=300/file_3.csv @@ -183,8 +181,7 @@ async fn query_partitioned_csv_file() { +---------+-------+-------+---+----+-----+ ------- Object Store Request Summary ------- RequestCountingObjectStore() - Total Requests: 2 - - LIST prefix=data + Total Requests: 1 - GET (opts) path=data/a=2/b=20/c=200/file_2.csv " ); @@ -201,8 +198,7 @@ async fn query_partitioned_csv_file() { +---------+-------+-------+---+----+-----+ ------- Object Store Request Summary ------- RequestCountingObjectStore() - Total Requests: 2 - - LIST prefix=data + Total Requests: 1 - GET (opts) path=data/a=2/b=20/c=200/file_2.csv " ); @@ -237,8 +233,7 @@ async fn query_partitioned_csv_file() { +---------+-------+-------+---+----+-----+ ------- Object Store Request Summary ------- RequestCountingObjectStore() - Total Requests: 2 - - LIST prefix=data + Total Requests: 1 - GET (opts) path=data/a=1/b=10/c=100/file_1.csv " ); diff --git a/datafusion/execution/src/cache/cache_manager.rs b/datafusion/execution/src/cache/cache_manager.rs index 
98be9ce34b28..4b76638e281a 100644 --- a/datafusion/execution/src/cache/cache_manager.rs +++ b/datafusion/execution/src/cache/cache_manager.rs @@ -24,6 +24,11 @@ use std::any::Any; use std::collections::HashMap; use std::fmt::{Debug, Formatter}; use std::sync::Arc; +use std::time::Duration; + +use super::cache_unit::{ + DefaultListFilesCache, DEFAULT_LIST_FILES_CACHE_LIMIT, DEFAULT_LIST_FILES_CACHE_TTL, +}; /// A cache for [`Statistics`]. /// @@ -42,8 +47,18 @@ pub type FileStatisticsCache = /// especially when done over remote object stores. /// /// See [`crate::runtime_env::RuntimeEnv`] for more details -pub type ListFilesCache = - Arc>, Extra = ObjectMeta>>; +pub trait ListFilesCache: + CacheAccessor>, Extra = ObjectMeta> +{ + /// Returns the cache's object limit. + fn cache_limit(&self) -> usize; + + /// Returns the cache's object TTL. + fn cache_ttl(&self) -> Duration; + + /// Updates the cache with a new object limit. + fn update_cache_limit(&self, limit: usize); +} /// Generic file-embedded metadata used with [`FileMetadataCache`]. 
/// @@ -109,7 +124,7 @@ impl Debug for dyn CacheAccessor, Extra = ObjectMeta> { } } -impl Debug for dyn CacheAccessor>, Extra = ObjectMeta> { +impl Debug for dyn ListFilesCache { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "Cache name: {} with length: {}", self.name(), self.len()) } @@ -131,7 +146,7 @@ impl Debug for dyn FileMetadataCache { #[derive(Debug)] pub struct CacheManager { file_statistic_cache: Option, - list_files_cache: Option, + list_files_cache: Option>, file_metadata_cache: Arc, } @@ -140,7 +155,17 @@ impl CacheManager { let file_statistic_cache = config.table_files_statistics_cache.as_ref().map(Arc::clone); - let list_files_cache = config.list_files_cache.as_ref().map(Arc::clone); + let list_files_cache = config + .list_files_cache + .as_ref() + .map(Arc::clone) + .unwrap_or_else(|| { + Arc::new(DefaultListFilesCache::new( + // TODO: config + 512 * 1024, + Duration::new(600, 0), + )) + }); let file_metadata_cache = config .file_metadata_cache @@ -155,7 +180,7 @@ impl CacheManager { Ok(Arc::new(CacheManager { file_statistic_cache, - list_files_cache, + list_files_cache: Some(list_files_cache), // TODO: reinstate optionality file_metadata_cache, })) } @@ -166,10 +191,24 @@ impl CacheManager { } /// Get the cache for storing the result of listing [`ObjectMeta`]s under the same path. - pub fn get_list_files_cache(&self) -> Option { + pub fn get_list_files_cache(&self) -> Option> { self.list_files_cache.clone() } + /// Get the limit of the list files cache. + pub fn get_list_files_cache_limit(&self) -> usize { + self.list_files_cache + .as_ref() + .map_or(DEFAULT_LIST_FILES_CACHE_LIMIT, |c| c.cache_limit()) + } + + /// Get the TTL of the list files cache. + pub fn get_list_files_cache_ttl(&self) -> Duration { + self.list_files_cache + .as_ref() + .map_or(DEFAULT_LIST_FILES_CACHE_TTL, |c| c.cache_ttl()) + } + /// Get the file embedded metadata cache. 
pub fn get_file_metadata_cache(&self) -> Arc { Arc::clone(&self.file_metadata_cache) @@ -189,13 +228,20 @@ pub struct CacheManagerConfig { /// Avoid get same file statistics repeatedly in same datafusion session. /// Default is disable. Fow now only supports Parquet files. pub table_files_statistics_cache: Option, - /// Enable cache of file metadata when listing files. - /// This setting avoids listing file meta of the same path repeatedly - /// in same session, which may be expensive in certain situations (e.g. remote object storage). + /// Enable caching of file metadata when listing files. + /// Enabling the cache avoids repeat list and metadata fetch operations, which may be expensive + /// in certain situations (e.g. remote object storage), for objects under paths that are + /// cached. /// Note that if this option is enabled, DataFusion will not see any updates to the underlying - /// location. - /// Default is disable. - pub list_files_cache: Option, + /// storage for at least `list_files_cache_ttl` duration. + /// Default is disabled. + pub list_files_cache: Option>, + /// Limit the number of objects to keep in the `list_files_cache`. Default: ~130k objects + pub list_files_cache_limit: usize, + /// The duration the list files cache will consider an entry valid after insertion. Note that + /// changes to the underlying storage system, such as adding or removing data, will not be + /// visible until an entry expires. Default: 10 minutes. + pub list_files_cache_ttl: Duration, /// Cache of file-embedded metadata, used to avoid reading it multiple times when processing a /// data file (e.g., Parquet footer and page metadata). /// If not provided, the [`CacheManager`] will create a [`DefaultFilesMetadataCache`]. 
@@ -209,6 +255,8 @@ impl Default for CacheManagerConfig { Self { table_files_statistics_cache: Default::default(), list_files_cache: Default::default(), + list_files_cache_limit: DEFAULT_LIST_FILES_CACHE_LIMIT, + list_files_cache_ttl: DEFAULT_LIST_FILES_CACHE_TTL, file_metadata_cache: Default::default(), metadata_cache_limit: DEFAULT_METADATA_CACHE_LIMIT, } @@ -228,13 +276,30 @@ impl CacheManagerConfig { } /// Set the cache for listing files. - /// + /// /// Default is `None` (disabled). - pub fn with_list_files_cache(mut self, cache: Option) -> Self { + pub fn with_list_files_cache( + mut self, + cache: Option>, + ) -> Self { self.list_files_cache = cache; self } + pub fn with_list_files_cache_limit(mut self, limit: usize) -> Self { + self.list_files_cache_limit = limit; + self + } + + pub fn with_list_files_cache_ttl(mut self, ttl: Duration) -> Self { + self.list_files_cache_ttl = ttl; + if ttl.is_zero() { + self.list_files_cache = None + } + + self + } + /// Sets the cache for file-embedded metadata. /// /// Default is a [`DefaultFilesMetadataCache`]. diff --git a/datafusion/execution/src/cache/cache_unit.rs b/datafusion/execution/src/cache/cache_unit.rs index 0f9cff54d587..faef9c4e245f 100644 --- a/datafusion/execution/src/cache/cache_unit.rs +++ b/datafusion/execution/src/cache/cache_unit.rs @@ -17,13 +17,15 @@ use std::collections::HashMap; use std::sync::{Arc, Mutex}; +use std::time::Duration; use crate::cache::cache_manager::{ - FileMetadata, FileMetadataCache, FileMetadataCacheEntry, + FileMetadata, FileMetadataCache, FileMetadataCacheEntry, ListFilesCache, }; use crate::cache::lru_queue::LruQueue; use crate::cache::CacheAccessor; +use datafusion_common::instant::Instant; use datafusion_common::Statistics; use dashmap::DashMap; @@ -111,27 +113,182 @@ impl CacheAccessor> for DefaultFileStatisticsCache { /// /// Collected files metadata for listing files. /// -/// Cache is not invalided until user calls [`Self::remove`] or [`Self::clear`]. 
+/// # Internal details +/// +/// The `capacity` parameter controls the maximum number of entries in the cache, using a Least +/// Recently Used eviction algorithm. When adding a new entry, if the total number of entries in +/// the cache exceeds `capacity`, the least recently used entries are evicted until the total +/// entries are lower than the `capacity`. /// /// [`ListFilesCache`]: crate::cache::cache_manager::ListFilesCache #[derive(Default)] pub struct DefaultListFilesCache { - statistics: DashMap>>, + state: Mutex, +} + +impl DefaultListFilesCache { + pub fn new(capacity: usize, ttl: Duration) -> Self { + Self { + state: Mutex::new(DefaultListFilesCacheState::new(capacity, ttl)), + } + } + + pub fn cache_limit(&self) -> usize { + self.state.lock().unwrap().capacity + } + + pub fn cache_ttl(&self) -> Duration { + self.state.lock().unwrap().ttl + } +} + +pub(super) const DEFAULT_LIST_FILES_CACHE_LIMIT: usize = 128 * 1024; // ~130k objects +pub(super) const DEFAULT_LIST_FILES_CACHE_TTL: Duration = Duration::new(600, 0); // 10min + +pub struct DefaultListFilesCacheState { + lru_queue: LruQueue>, Instant)>, + capacity: usize, // TODO: do "bytes" matter here, or should we stick with "entries"? + size: usize, + ttl: Duration, +} + +// TODO: Do we even want to support "default" here? 
+impl Default for DefaultListFilesCacheState { + fn default() -> Self { + Self { + lru_queue: LruQueue::new(), + capacity: DEFAULT_LIST_FILES_CACHE_LIMIT, + size: 0, + ttl: DEFAULT_LIST_FILES_CACHE_TTL, + } + } +} + +impl DefaultListFilesCacheState { + fn new(capacity: usize, ttl: Duration) -> Self { + Self { + lru_queue: LruQueue::new(), + capacity, + size: 0, + ttl, + } + } + + fn get(&mut self, key: &Path) -> Option>> { + let (object_metas, expires) = self.lru_queue.get(key)?; + + if Instant::now() > *expires { + self.remove(key); + None + } else { + Some(Arc::clone(object_metas)) + } + } + + fn contains_key(&mut self, k: &Path) -> bool { + let Some((_, expires)) = self.lru_queue.peek(k) else { + return false; + }; + + if Instant::now() > *expires { + self.remove(k); + return false; + } + + true + } + + fn put( + &mut self, + key: &Path, + value: Arc>, + ) -> Option>> { + let value_size = value.len(); + + // no point in trying to add this value to the cache if it cannot fit entirely + if value_size > self.capacity { + return None; + } + + // if the key is already in the cache, the old value is removed + let expires = Instant::now() + self.ttl; + let old_value = self.lru_queue.put(key.clone(), (value, expires)); + self.size += value_size; + if let Some((ref old_metas, _)) = old_value { + self.size -= old_metas.len(); + } + + self.evict_entries(); + + old_value.map(|v| v.0) + } + + fn evict_entries(&mut self) { + while self.size > self.capacity { + if let Some(removed) = self.lru_queue.pop() { + let metas: Arc> = removed.1 .0; + self.size -= metas.len(); + } else { + // cache is empty while memory_used > memory_limit, cannot happen + debug_assert!( + false, + "cache is empty while memory_used > memory_limit, cannot happen" + ); + return; + } + } + } + + /// Removes an entry from the cache and returns it, if it exists. 
+ fn remove(&mut self, k: &Path) -> Option>> { + if let Some((old_metas, _)) = self.lru_queue.remove(k) { + self.size -= old_metas.len(); + Some(old_metas) + } else { + None + } + } + + /// Returns the number of entries currently cached. + fn len(&self) -> usize { + self.lru_queue.len() + } + + /// Removes all entries from the cache. + fn clear(&mut self) { + self.lru_queue.clear(); + self.size = 0; + } +} + +impl ListFilesCache for DefaultListFilesCache { + fn cache_limit(&self) -> usize { + let state = self.state.lock().unwrap(); + state.capacity + } + + fn cache_ttl(&self) -> Duration { + let state = self.state.lock().unwrap(); + state.ttl + } + + fn update_cache_limit(&self, limit: usize) { + let mut state = self.state.lock().unwrap(); + state.capacity = limit; + state.evict_entries(); + } +} impl CacheAccessor>> for DefaultListFilesCache { type Extra = ObjectMeta; fn get(&self, k: &Path) -> Option>> { - self.statistics.get(k).map(|x| Arc::clone(x.value())) + let mut state = self.state.lock().unwrap(); + state.get(k) } - fn get_with_extra( - &self, - _k: &Path, - _e: &Self::Extra, - ) -> Option>> { - panic!("Not supported DefaultListFilesCache get_with_extra") + fn get_with_extra(&self, k: &Path, _e: &Self::Extra) -> Option>> { + self.get(k) } fn put( @@ -139,36 +296,41 @@ impl CacheAccessor>> for DefaultListFilesCache { key: &Path, value: Arc>, ) -> Option>> { - self.statistics.insert(key.clone(), value) + let mut state = self.state.lock().unwrap(); + state.put(key, value) } fn put_with_extra( &self, - _key: &Path, - _value: Arc>, + key: &Path, + value: Arc>, _e: &Self::Extra, ) -> Option>> { - panic!("Not supported DefaultListFilesCache put_with_extra") + self.put(key, value) } fn remove(&self, k: &Path) -> Option>> { - self.statistics.remove(k).map(|x| x.1) + let mut state = self.state.lock().unwrap(); + state.remove(k) } fn contains_key(&self, k: &Path) -> bool { - self.statistics.contains_key(k) + let mut state = self.state.lock().unwrap(); + 
state.contains_key(k) } fn len(&self) -> usize { - self.statistics.len() + let state = self.state.lock().unwrap(); + state.len() } fn clear(&self) { - self.statistics.clear() + let mut state = self.state.lock().unwrap(); + state.clear(); } fn name(&self) -> String { - "DefaultListFilesCache".to_string() + String::from("DefaultListFilesCache") } } diff --git a/datafusion/execution/src/cache/mod.rs b/datafusion/execution/src/cache/mod.rs index 6e0737154dec..adc2e75bbab8 100644 --- a/datafusion/execution/src/cache/mod.rs +++ b/datafusion/execution/src/cache/mod.rs @@ -19,6 +19,7 @@ pub mod cache_manager; pub mod cache_unit; pub mod lru_queue; +// TODO: driveby-cleanup /// The cache accessor, users usually working on this interface while manipulating caches. /// This interface does not get `mut` references and thus has to handle its own /// locking via internal mutability. It can be accessed via multiple concurrent queries diff --git a/datafusion/execution/src/runtime_env.rs b/datafusion/execution/src/runtime_env.rs index d69987600855..e72f653e347a 100644 --- a/datafusion/execution/src/runtime_env.rs +++ b/datafusion/execution/src/runtime_env.rs @@ -33,12 +33,12 @@ use crate::cache::cache_manager::{CacheManager, CacheManagerConfig}; use crate::parquet_encryption::{EncryptionFactory, EncryptionFactoryRegistry}; use datafusion_common::{config::ConfigEntry, Result}; use object_store::ObjectStore; -use std::path::PathBuf; use std::sync::Arc; use std::{ fmt::{Debug, Formatter}, num::NonZeroUsize, }; +use std::{path::PathBuf, time::Duration}; use url::Url; #[derive(Clone)] @@ -294,6 +294,18 @@ impl RuntimeEnvBuilder { self } + /// Specify a limit on the number of objects to store in the object list cache + pub fn with_object_list_cache_limit(mut self, limit: usize) -> Self { + self.cache_manager = self.cache_manager.with_list_files_cache_limit(limit); + self + } + + /// Specify the duration entries in the object list cache will be considered valid + pub fn 
with_object_list_cache_ttl(mut self, ttl: Duration) -> Self { + self.cache_manager = self.cache_manager.with_list_files_cache_ttl(ttl); + self + } + /// Build a RuntimeEnv pub fn build(self) -> Result { let Self { @@ -335,6 +347,10 @@ impl RuntimeEnvBuilder { .cache_manager .get_file_statistic_cache(), list_files_cache: runtime_env.cache_manager.get_list_files_cache(), + list_files_cache_limit: runtime_env + .cache_manager + .get_list_files_cache_limit(), + list_files_cache_ttl: runtime_env.cache_manager.get_list_files_cache_ttl(), file_metadata_cache: Some( runtime_env.cache_manager.get_file_metadata_cache(), ),