diff --git a/detect_secrets/core/baseline.py b/detect_secrets/core/baseline.py
index d75bb873a..f71a821c1 100644
--- a/detect_secrets/core/baseline.py
+++ b/detect_secrets/core/baseline.py
@@ -5,6 +5,7 @@
 from typing import cast
 from typing import Dict
 from typing import List
+from typing import Optional
 from typing import Union
 
 from . import upgrades
@@ -18,16 +19,23 @@
 from .secrets_collection import SecretsCollection
 
 
-def create(*paths: str, should_scan_all_files: bool = False, root: str = '') -> SecretsCollection:
+def create(
+    *paths: str,
+    should_scan_all_files: bool = False,
+    root: str = '',
+    num_processors: Optional[int] = None,
+) -> SecretsCollection:
     """Scans all the files recursively in path to initialize a baseline."""
-    secrets = SecretsCollection(root=root)
+    # Only pass num_processors through when set, so scan_files keeps its own default.
+    kwargs = {}
+    if num_processors:
+        kwargs['num_processors'] = num_processors
 
-    for filename in get_files_to_scan(
-        *paths,
-        should_scan_all_files=should_scan_all_files,
-        root=root,
-    ):
-        secrets.scan_file(filename)
+    secrets = SecretsCollection(root=root)
+    secrets.scan_files(
+        *get_files_to_scan(*paths, should_scan_all_files=should_scan_all_files, root=root),
+        **kwargs,
+    )
 
     return secrets
 
diff --git a/detect_secrets/core/secrets_collection.py b/detect_secrets/core/secrets_collection.py
index e30410d50..4e16a3752 100644
--- a/detect_secrets/core/secrets_collection.py
+++ b/detect_secrets/core/secrets_collection.py
@@ -1,3 +1,4 @@
+import multiprocessing as mp
 import os
 from collections import defaultdict
 from typing import Any
@@ -45,6 +46,36 @@ def load_from_baseline(cls, baseline: Dict[str, Any]) -> 'SecretsCollection':
     def files(self) -> Set[str]:
         return set(self.data.keys())
 
+    def scan_files(self, *filenames: str, num_processors: Optional[int] = None) -> None:
+        """Just like scan_file, but optimized through parallel processing.
+
+        :param filenames: paths to scan; each is joined onto self.root first.
+        :param num_processors: number of worker processes. A falsy value
+            (None or 0) falls back to multiprocessing.cpu_count().
+        """
+        if not filenames:
+            # Nothing to scan, so don't start a worker pool at all.
+            return
+
+        if len(filenames) == 1:
+            # A single file is scanned in this process rather than through a pool.
+            self.scan_file(filenames[0])
+            return
+
+        if not num_processors:
+            num_processors = mp.cpu_count()
+
+        with mp.Pool(processes=num_processors) as p:
+            # imap_unordered yields results in arbitrary order; that is fine here
+            # because every secret is filed under its own filename.
+            for secrets in p.imap_unordered(
+                _scan_file_and_serialize,
+                [os.path.join(self.root, filename) for filename in filenames],
+            ):
+                for secret in secrets:
+                    # Store under the path relative to self.root.
+                    self[os.path.relpath(secret.filename, self.root)].add(secret)
+
     def scan_file(self, filename: str) -> None:
         for secret in scan.scan_file(os.path.join(self.root, filename)):
             self[filename].add(secret)
@@ -269,3 +300,12 @@ def __sub__(self, other: Any) -> 'SecretsCollection':
             output[filename] = self[filename]
 
         return output
+
+
+def _scan_file_and_serialize(filename: str) -> List[PotentialSecret]:
+    """Used for multiprocessing, since lambdas can't be serialized.
+
+    The results of scan.scan_file are collected into a list, which is the
+    value handed back to the parent process.
+    """
+    return list(scan.scan_file(filename))
diff --git a/detect_secrets/core/usage/__init__.py b/detect_secrets/core/usage/__init__.py
index 50ac3eada..ea7fee8ba 100644
--- a/detect_secrets/core/usage/__init__.py
+++ b/detect_secrets/core/usage/__init__.py
@@ -47,6 +47,18 @@ def add_default_options(self) -> 'ParserBuilder':
                 'working directory.'
             ),
         )
+        self._parser.add_argument(
+            '-c',
+            '--cores',
+            dest='num_cores',
+            nargs=1,
+            type=int,
+            default=[None],
+            help=(
+                'Specify the number of cores to use for parallel processing. Defaults to '
+                'using the max cores on the current host.'
+            ),
+        )
 
         return self
 
@@ -161,6 +173,8 @@ def parse_args(self, argv: Optional[List[str]] = None) -> argparse.Namespace:
         if args.path == ['.']:
             args.path = [args.custom_root]
 
+        args.num_cores = args.num_cores[0]
+
         return args
 
 
diff --git a/detect_secrets/main.py b/detect_secrets/main.py
index ee4fb75e7..5040ef558 100644
--- a/detect_secrets/main.py
+++ b/detect_secrets/main.py
@@ -71,6 +71,7 @@ def handle_scan_action(args: argparse.Namespace) -> None:
         *args.path,
         should_scan_all_files=args.all_files,
         root=args.custom_root,
+        num_processors=args.num_cores,
     )
     if args.baseline is not None:
         # The pre-commit hook's baseline upgrade is to trim the supplied baseline for non-existent