From 17f1ecd56e376a75d66e01be47029f1b1b29c0aa Mon Sep 17 00:00:00 2001 From: Thomas de Zeeuw Date: Tue, 20 Jun 2023 17:43:09 +0200 Subject: [PATCH] Open cache files in parallel (#5120) ## Summary Open cache files in parallel (again), brings the performance back to be roughly equal to the old implementation. ## Test Plan Existing tests should keep working. --- .github/workflows/ci.yaml | 3 + .pre-commit-config.yaml | 1 + crates/ruff/src/settings/mod.rs | 2 +- .../test/fixtures/cache_mutable/.gitignore | 2 + .../test/fixtures/cache_mutable/source.py | 4 + crates/ruff_cli/src/cache.rs | 363 +++++++++++++++--- crates/ruff_cli/src/commands/run.rs | 74 ++-- crates/ruff_cli/src/diagnostics.rs | 27 +- 8 files changed, 352 insertions(+), 124 deletions(-) create mode 100644 crates/ruff_cli/resources/test/fixtures/cache_mutable/.gitignore create mode 100644 crates/ruff_cli/resources/test/fixtures/cache_mutable/source.py diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 46ad6cd981f5a..6cae13b9e9341 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -76,6 +76,9 @@ jobs: cargo insta test --all --all-features git diff --exit-code - run: cargo test --package ruff_cli --test black_compatibility_test -- --ignored + # Skipped as it's currently broken. The resource were moved from the + # ruff_cli to ruff crate, but this test was not updated. + if: false # Check for broken links in the documentation. - run: cargo doc --all --no-deps env: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 16ffe822863bc..3331ffbd7f5af 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,6 +4,7 @@ exclude: | (?x)^( crates/ruff/resources/.*| crates/ruff/src/rules/.*/snapshots/.*| + crates/ruff_cli/resources/.*| crates/ruff_python_formatter/resources/.*| crates/ruff_python_formatter/src/snapshots/.* )$ diff --git a/crates/ruff/src/settings/mod.rs b/crates/ruff/src/settings/mod.rs index fb5d6f8894ab2..9a36b599080b4 100644 --- a/crates/ruff/src/settings/mod.rs +++ b/crates/ruff/src/settings/mod.rs @@ -40,7 +40,7 @@ pub mod types; const CARGO_PKG_VERSION: &str = env!("CARGO_PKG_VERSION"); -#[derive(Debug)] +#[derive(Debug, Default)] pub struct AllSettings { pub cli: CliSettings, pub lib: Settings, diff --git a/crates/ruff_cli/resources/test/fixtures/cache_mutable/.gitignore b/crates/ruff_cli/resources/test/fixtures/cache_mutable/.gitignore new file mode 100644 index 0000000000000..92d4d36a244eb --- /dev/null +++ b/crates/ruff_cli/resources/test/fixtures/cache_mutable/.gitignore @@ -0,0 +1,2 @@ +# Modified by the cache tests. +source.py diff --git a/crates/ruff_cli/resources/test/fixtures/cache_mutable/source.py b/crates/ruff_cli/resources/test/fixtures/cache_mutable/source.py new file mode 100644 index 0000000000000..7e397f06e5744 --- /dev/null +++ b/crates/ruff_cli/resources/test/fixtures/cache_mutable/source.py @@ -0,0 +1,4 @@ +# NOTE: sync with cache::invalidation test +a = 1 + +__all__ = list(["a", "b"]) diff --git a/crates/ruff_cli/src/cache.rs b/crates/ruff_cli/src/cache.rs index d52ade04eca7f..d543a8bf94c4d 100644 --- a/crates/ruff_cli/src/cache.rs +++ b/crates/ruff_cli/src/cache.rs @@ -6,11 +6,12 @@ use std::path::{Path, PathBuf}; use std::sync::Mutex; use std::time::SystemTime; -use anyhow::{anyhow, Context, Result}; +use anyhow::{Context, Result}; use serde::{Deserialize, Serialize}; use ruff::message::Message; use ruff::settings::Settings; +use ruff::warn_user; use ruff_cache::{CacheKey, CacheKeyHasher}; use ruff_diagnostics::{DiagnosticKind, Fix}; use ruff_python_ast::imports::ImportMap; @@ -19,33 +20,45 @@ use ruff_text_size::{TextRange, TextSize}; use crate::diagnostics::Diagnostics; -/// On disk representation of a cache of a package. -#[derive(Deserialize, Debug, Serialize)] -pub(crate) struct PackageCache { +/// [`Path`] that is relative to the package root in [`PackageCache`]. +pub(crate) type RelativePath = Path; +/// [`PathBuf`] that is relative to the package root in [`PackageCache`]. +pub(crate) type RelativePathBuf = PathBuf; + +/// Cache. +/// +/// `Cache` holds everything required to display the diagnostics for a single +/// package. The on-disk representation is represented in [`PackageCache`] (and +/// related) types. +/// +/// This type manages the cache file, reading it from disk and writing it back +/// to disk (if required). +pub(crate) struct Cache { /// Location of the cache. - /// - /// Not stored on disk, just used as a storage location. - #[serde(skip)] path: PathBuf, - /// Path to the root of the package. + /// Package cache read from disk. + package: PackageCache, + /// Changes made compared to the (current) `package`. /// - /// Usually this is a directory, but it can also be a single file in case of - /// single file "packages", e.g. scripts. - package_root: PathBuf, - /// Mapping of source file path to it's cached data. - // TODO: look into concurrent hashmap or similar instead of a mutex. - files: Mutex>, + /// Files that are linted, but are not in `package.files` or are in + /// `package.files` but are outdated. This gets merged with `package.files` + /// when the cache is written back to disk in [`Cache::store`]. + new_files: Mutex>, } -impl PackageCache { - /// Open or create a new package cache. +impl Cache { + /// Open or create a new cache. + /// + /// `cache_dir` is considered the root directory of the cache, which can be + /// local to the project, global or otherwise set by the user. + /// + /// `package_root` is the path to root of the package that is contained + /// within this cache and must be canonicalized (to avoid considering `./` + /// and `../project` being different). /// - /// `package_root` must be canonicalized. - pub(crate) fn open( - cache_dir: &Path, - package_root: PathBuf, - settings: &Settings, - ) -> Result { + /// Finally `settings` is used to ensure we don't open a cache for different + /// settings. + pub(crate) fn open(cache_dir: &Path, package_root: PathBuf, settings: &Settings) -> Cache { debug_assert!(package_root.is_absolute(), "package root not canonicalized"); let mut buf = itoa::Buffer::new(); @@ -56,40 +69,66 @@ impl PackageCache { Ok(file) => file, Err(err) if err.kind() == io::ErrorKind::NotFound => { // No cache exist yet, return an empty cache. - return Ok(PackageCache { - path, - package_root, - files: Mutex::new(HashMap::new()), - }); + return Cache::empty(path, package_root); } Err(err) => { - return Err(err) - .with_context(|| format!("Failed to open cache file '{}'", path.display()))? + warn_user!("Failed to open cache file '{}': {err}", path.display()); + return Cache::empty(path, package_root); } }; - let mut cache: PackageCache = bincode::deserialize_from(BufReader::new(file)) - .with_context(|| format!("Failed parse cache file '{}'", path.display()))?; + let mut package: PackageCache = match bincode::deserialize_from(BufReader::new(file)) { + Ok(package) => package, + Err(err) => { + warn_user!("Failed parse cache file '{}': {err}", path.display()); + return Cache::empty(path, package_root); + } + }; // Sanity check. - if cache.package_root != package_root { - return Err(anyhow!( + if package.package_root != package_root { + warn_user!( "Different package root in cache: expected '{}', got '{}'", package_root.display(), - cache.package_root.display(), - )); + package.package_root.display(), + ); + package.files.clear(); } + Cache { + path, + package, + new_files: Mutex::new(HashMap::new()), + } + } - cache.path = path; - Ok(cache) + /// Create an empty `Cache`. + fn empty(path: PathBuf, package_root: PathBuf) -> Cache { + Cache { + path, + package: PackageCache { + package_root, + files: HashMap::new(), + }, + new_files: Mutex::new(HashMap::new()), + } } - /// Store the cache to disk. - pub(crate) fn store(&self) -> Result<()> { + /// Store the cache to disk, if it has been changed. + pub(crate) fn store(mut self) -> Result<()> { + let new_files = self.new_files.into_inner().unwrap(); + if new_files.is_empty() { + // No changes made, no need to write the same cache file back to + // disk. + return Ok(()); + } + + // Add/overwrite the changes made. + self.package.files.extend(new_files); + let file = File::create(&self.path) .with_context(|| format!("Failed to create cache file '{}'", self.path.display()))?; let writer = BufWriter::new(file); - bincode::serialize_into(writer, &self).with_context(|| { + bincode::serialize_into(writer, &self.package).with_context(|| { format!( "Failed to serialise cache to file '{}'", self.path.display() @@ -101,7 +140,7 @@ impl PackageCache { /// /// Returns `None` if `path` is not within the package. pub(crate) fn relative_path<'a>(&self, path: &'a Path) -> Option<&'a RelativePath> { - path.strip_prefix(&self.package_root).ok() + path.strip_prefix(&self.package.package_root).ok() } /// Get the cached results for a single file at relative `path`. This uses @@ -114,33 +153,34 @@ impl PackageCache { &self, path: &RelativePath, file_last_modified: SystemTime, - ) -> Option { - let files = self.files.lock().unwrap(); - let file = files.get(path)?; + ) -> Option<&FileCache> { + let file = self.package.files.get(path)?; // Make sure the file hasn't changed since the cached run. if file.last_modified != file_last_modified { return None; } - Some(file.clone()) + Some(file) } /// Add or update a file cache at `path` relative to the package root. pub(crate) fn update(&self, path: RelativePathBuf, file: FileCache) { - self.files.lock().unwrap().insert(path, file); - } - - /// Remove a file cache at `path` relative to the package root. - pub(crate) fn remove(&self, path: &RelativePath) { - self.files.lock().unwrap().remove(path); + self.new_files.lock().unwrap().insert(path, file); } } -/// [`Path`] that is relative to the package root in [`PackageCache`]. -pub(crate) type RelativePath = Path; -/// [`PathBuf`] that is relative to the package root in [`PackageCache`]. -pub(crate) type RelativePathBuf = PathBuf; +/// On disk representation of a cache of a package. +#[derive(Deserialize, Debug, Serialize)] +struct PackageCache { + /// Path to the root of the package. + /// + /// Usually this is a directory, but it can also be a single file in case of + /// single file "packages", e.g. scripts. + package_root: PathBuf, + /// Mapping of source file path to it's cached data. + files: HashMap, +} /// On disk representation of the cache per source file. #[derive(Clone, Deserialize, Debug, Serialize)] @@ -198,23 +238,23 @@ impl FileCache { } /// Convert the file cache into `Diagnostics`, using `path` as file name. - pub(crate) fn into_diagnostics(self, path: &Path) -> Diagnostics { + pub(crate) fn as_diagnostics(&self, path: &Path) -> Diagnostics { let messages = if self.messages.is_empty() { Vec::new() } else { - let file = SourceFileBuilder::new(path.to_string_lossy(), self.source).finish(); + let file = SourceFileBuilder::new(path.to_string_lossy(), &*self.source).finish(); self.messages - .into_iter() + .iter() .map(|msg| Message { - kind: msg.kind, + kind: msg.kind.clone(), range: msg.range, - fix: msg.fix, + fix: msg.fix.clone(), file: file.clone(), noqa_offset: msg.noqa_offset, }) .collect() }; - Diagnostics::new(messages, self.imports) + Diagnostics::new(messages, self.imports.clone()) } } @@ -257,3 +297,204 @@ pub(crate) fn init(path: &Path) -> Result<()> { Ok(()) } + +#[cfg(test)] +mod test { + use std::env::temp_dir; + use std::fs; + use std::io::{self, Write}; + use std::path::Path; + + use ruff::settings::{flags, AllSettings}; + use ruff_cache::CACHE_DIR_NAME; + + use crate::cache::{self, Cache}; + use crate::diagnostics::{lint_path, Diagnostics}; + + #[test] + fn same_results() { + let mut cache_dir = temp_dir(); + cache_dir.push("ruff_tests/cache_same_results"); + let _ = fs::remove_dir_all(&cache_dir); + cache::init(&cache_dir).unwrap(); + + let settings = AllSettings::default(); + + let package_root = fs::canonicalize("../ruff/resources/test/fixtures").unwrap(); + let cache = Cache::open(&cache_dir, package_root.clone(), &settings.lib); + assert_eq!(cache.new_files.lock().unwrap().len(), 0); + + let mut paths = Vec::new(); + let mut parse_errors = Vec::new(); + let mut expected_diagnostics = Diagnostics::default(); + for entry in fs::read_dir(&package_root).unwrap() { + let entry = entry.unwrap(); + if !entry.file_type().unwrap().is_dir() { + continue; + } + + let dir_path = entry.path(); + if dir_path.ends_with(CACHE_DIR_NAME) { + continue; + } + + for entry in fs::read_dir(dir_path).unwrap() { + let entry = entry.unwrap(); + if !entry.file_type().unwrap().is_file() { + continue; + } + + let path = entry.path(); + if path.ends_with("pyproject.toml") || path.ends_with("R.ipynb") { + continue; + } + + let diagnostics = lint_path( + &path, + Some(&package_root), + &settings, + Some(&cache), + flags::Noqa::Enabled, + flags::FixMode::Generate, + ) + .unwrap(); + if diagnostics + .messages + .iter() + .any(|m| m.kind.name == "SyntaxError") + { + parse_errors.push(path.clone()); + } + paths.push(path); + expected_diagnostics += diagnostics; + } + } + assert_ne!(paths, &[] as &[std::path::PathBuf], "no files checked"); + + cache.store().unwrap(); + + let cache = Cache::open(&cache_dir, package_root.clone(), &settings.lib); + assert_ne!(cache.package.files.len(), 0); + + parse_errors.sort(); + + for path in &paths { + if parse_errors.binary_search(path).is_ok() { + continue; // We don't cache parsing errors. + } + + let relative_path = cache.relative_path(path).unwrap(); + + assert!( + cache.package.files.contains_key(relative_path), + "missing file from cache: '{}'", + relative_path.display() + ); + } + + let mut got_diagnostics = Diagnostics::default(); + for path in paths { + got_diagnostics += lint_path( + &path, + Some(&package_root), + &settings, + Some(&cache), + flags::Noqa::Enabled, + flags::FixMode::Generate, + ) + .unwrap(); + } + + // Not stored in the cache. + expected_diagnostics.source_kind.clear(); + got_diagnostics.source_kind.clear(); + assert!(expected_diagnostics == got_diagnostics); + } + + #[test] + fn invalidation() { + // NOTE: keep in sync with actual file. + const SOURCE: &[u8] = b"# NOTE: sync with cache::invalidation test\na = 1\n\n__all__ = list([\"a\", \"b\"])\n"; + + let mut cache_dir = temp_dir(); + cache_dir.push("ruff_tests/cache_invalidation"); + let _ = fs::remove_dir_all(&cache_dir); + cache::init(&cache_dir).unwrap(); + + let settings = AllSettings::default(); + let package_root = fs::canonicalize("resources/test/fixtures/cache_mutable").unwrap(); + let cache = Cache::open(&cache_dir, package_root.clone(), &settings.lib); + assert_eq!(cache.new_files.lock().unwrap().len(), 0); + + let path = package_root.join("source.py"); + let mut expected_diagnostics = lint_path( + &path, + Some(&package_root), + &settings, + Some(&cache), + flags::Noqa::Enabled, + flags::FixMode::Generate, + ) + .unwrap(); + assert_eq!(cache.new_files.lock().unwrap().len(), 1); + + cache.store().unwrap(); + + let tests = [ + // File change. + (|path| { + let mut file = fs::OpenOptions::new() + .write(true) + .truncate(true) + .open(path)?; + file.write_all(SOURCE)?; + file.sync_data()?; + Ok(|_| Ok(())) + }) as fn(&Path) -> io::Result io::Result<()>>, + // Regression for issue #3086. + #[cfg(unix)] + |path| { + flip_execute_permission_bit(path)?; + Ok(flip_execute_permission_bit) + }, + ]; + + #[cfg(unix)] + #[allow(clippy::items_after_statements)] + fn flip_execute_permission_bit(path: &Path) -> io::Result<()> { + use std::os::unix::fs::PermissionsExt; + let file = fs::OpenOptions::new().write(true).open(path)?; + let perms = file.metadata()?.permissions(); + file.set_permissions(PermissionsExt::from_mode(perms.mode() ^ 0o111)) + } + + for change_file in tests { + let cleanup = change_file(&path).unwrap(); + + let cache = Cache::open(&cache_dir, package_root.clone(), &settings.lib); + + let mut got_diagnostics = lint_path( + &path, + Some(&package_root), + &settings, + Some(&cache), + flags::Noqa::Enabled, + flags::FixMode::Generate, + ) + .unwrap(); + + cleanup(&path).unwrap(); + + assert_eq!( + cache.new_files.lock().unwrap().len(), + 1, + "cache must not be used" + ); + + // Not store in the cache. + expected_diagnostics.source_kind.clear(); + got_diagnostics.source_kind.clear(); + assert!(expected_diagnostics == got_diagnostics); + } + } +} diff --git a/crates/ruff_cli/src/commands/run.rs b/crates/ruff_cli/src/commands/run.rs index 62a37352f5113..9ff134f2c96d9 100644 --- a/crates/ruff_cli/src/commands/run.rs +++ b/crates/ruff_cli/src/commands/run.rs @@ -1,4 +1,4 @@ -use std::collections::{hash_map, HashMap}; +use std::collections::HashMap; use std::fmt::Write; use std::io; use std::path::{Path, PathBuf}; @@ -22,7 +22,7 @@ use ruff_python_ast::imports::ImportMap; use ruff_python_ast::source_code::SourceFileBuilder; use crate::args::Overrides; -use crate::cache::{self, PackageCache}; +use crate::cache::{self, Cache}; use crate::diagnostics::Diagnostics; use crate::panic::catch_unwind; @@ -77,37 +77,21 @@ pub(crate) fn run( pyproject_config, ); - // Create a cache per package, if enabled. - let package_caches = if cache.into() { - let mut caches = HashMap::new(); - // TODO(thomas): try to merge this with the detection of package roots - // above or with the parallel iteration below. - for entry in &paths { - let Ok(entry) = entry else { continue }; - let path = entry.path(); - let package = path - .parent() - .and_then(|parent| package_roots.get(parent)) - .and_then(|package| *package); - // For paths not in a package, e.g. scripts, we use the path as - // the package root. - let package_root = package.unwrap_or(path); - - let settings = resolver.resolve_all(path, pyproject_config); - - if let hash_map::Entry::Vacant(entry) = caches.entry(package_root) { - let cache = PackageCache::open( + // Load the caches. + let caches = bool::from(cache).then(|| { + package_roots + .par_iter() + .map(|(package_root, _)| { + let settings = resolver.resolve_all(package_root, pyproject_config); + let cache = Cache::open( &settings.cli.cache_dir, - package_root.to_owned(), + package_root.to_path_buf(), &settings.lib, - )?; - entry.insert(cache); - } - } - Some(caches) - } else { - None - }; + ); + (&**package_root, cache) + }) + .collect::>() + }); let start = Instant::now(); let mut diagnostics: Diagnostics = paths @@ -121,17 +105,13 @@ pub(crate) fn run( .and_then(|parent| package_roots.get(parent)) .and_then(|package| *package); - let package_cache = package_caches.as_ref().map(|package_caches| { - let package_root = package.unwrap_or(path); - let package_cache = package_caches - .get(package_root) - .expect("failed to get package cache"); - package_cache - }); - let settings = resolver.resolve_all(path, pyproject_config); + let package_root = package.unwrap_or_else(|| path.parent().unwrap_or(path)); + let cache = caches + .as_ref() + .map(|caches| caches.get(&package_root).unwrap()); - lint_path(path, package, settings, package_cache, noqa, autofix).map_err(|e| { + lint_path(path, package, settings, cache, noqa, autofix).map_err(|e| { (Some(path.to_owned()), { let mut error = e.to_string(); for cause in e.chain() { @@ -188,11 +168,11 @@ pub(crate) fn run( diagnostics.messages.sort(); - // Store the package caches. - if let Some(package_caches) = package_caches { - for package_cache in package_caches.values() { - package_cache.store()?; - } + // Store the caches. + if let Some(caches) = caches { + caches + .into_par_iter() + .try_for_each(|(_, cache)| cache.store())?; } let duration = start.elapsed(); @@ -207,12 +187,12 @@ fn lint_path( path: &Path, package: Option<&Path>, settings: &AllSettings, - package_cache: Option<&PackageCache>, + cache: Option<&Cache>, noqa: flags::Noqa, autofix: flags::FixMode, ) -> Result { let result = catch_unwind(|| { - crate::diagnostics::lint_path(path, package, settings, package_cache, noqa, autofix) + crate::diagnostics::lint_path(path, package, settings, cache, noqa, autofix) }); match result { diff --git a/crates/ruff_cli/src/diagnostics.rs b/crates/ruff_cli/src/diagnostics.rs index 68c110851e4a9..7fabf261414cb 100644 --- a/crates/ruff_cli/src/diagnostics.rs +++ b/crates/ruff_cli/src/diagnostics.rs @@ -25,7 +25,7 @@ use ruff_python_ast::imports::ImportMap; use ruff_python_ast::source_code::{LineIndex, SourceCode, SourceFileBuilder}; use ruff_python_stdlib::path::is_project_toml; -use crate::cache::{FileCache, PackageCache}; +use crate::cache::{Cache, FileCache}; #[derive(Debug, Default, PartialEq)] pub(crate) struct Diagnostics { @@ -100,7 +100,7 @@ pub(crate) fn lint_path( path: &Path, package: Option<&Path>, settings: &AllSettings, - package_cache: Option<&PackageCache>, + cache: Option<&Cache>, noqa: flags::Noqa, autofix: flags::FixMode, ) -> Result { @@ -110,17 +110,17 @@ pub(crate) fn lint_path( // to cache `fixer::Mode::Apply`, since a file either has no fixes, or we'll // write the fixes to disk, thus invalidating the cache. But it's a bit hard // to reason about. We need to come up with a better solution here.) - let caching = match package_cache { - Some(package_cache) if noqa.into() && autofix.is_generate() => { - let relative_path = package_cache + let caching = match cache { + Some(cache) if noqa.into() && autofix.is_generate() => { + let relative_path = cache .relative_path(path) .expect("wrong package cache for file"); let last_modified = path.metadata()?.modified()?; - if let Some(cache) = package_cache.get(relative_path, last_modified) { - return Ok(cache.into_diagnostics(path)); + if let Some(cache) = cache.get(relative_path, last_modified) { + return Ok(cache.as_diagnostics(path)); } - Some((package_cache, relative_path, last_modified)) + Some((cache, relative_path, last_modified)) } _ => None, }; @@ -207,14 +207,11 @@ pub(crate) fn lint_path( let imports = imports.unwrap_or_default(); - if let Some((package_cache, relative_path, file_last_modified)) = caching { - if parse_error.is_some() { - // We don't cache parsing error, so we remove the old file cache (if - // any). - package_cache.remove(relative_path); - } else { + if let Some((cache, relative_path, file_last_modified)) = caching { + if parse_error.is_none() { + // We don't cache parsing error. let file_cache = FileCache::new(file_last_modified, &messages, &imports); - package_cache.update(relative_path.to_owned(), file_cache); + cache.update(relative_path.to_owned(), file_cache); } }