From 2a063928161298cb5db8792de327c8c9d892ae92 Mon Sep 17 00:00:00 2001 From: Aaron Loo Date: Thu, 4 Mar 2021 07:53:03 -0800 Subject: [PATCH] adding gibberish detector --- .pre-commit-config.yaml | 6 +- .secrets.baseline | 56 ++---------- MANIFEST.in | 1 + README.md | 65 ++++++++++--- detect_secrets/core/baseline.py | 3 + detect_secrets/core/usage/filters.py | 23 +++++ detect_secrets/filters/__init__.py | 1 + detect_secrets/filters/gibberish/__init__.py | 93 +++++++++++++++++++ detect_secrets/filters/gibberish/rfc.model | 1 + detect_secrets/filters/util.py | 25 +++++ detect_secrets/filters/wordlist.py | 22 +---- detect_secrets/plugins/keyword.py | 96 +------------------- detect_secrets/settings.py | 13 +++ docs/filters.md | 1 + requirements-dev-minimal.txt | 1 + requirements-dev.txt | 1 + setup.py | 4 + testing/mocks.py | 16 ++++ tests/filters/common_filter_test.py | 7 +- tests/filters/gibberish_filter_test.py | 61 +++++++++++++ tests/main_test.py | 3 +- tests/pre_commit_hook_test.py | 45 ++++----- 22 files changed, 338 insertions(+), 206 deletions(-) create mode 100644 MANIFEST.in create mode 100644 detect_secrets/filters/gibberish/__init__.py create mode 100644 detect_secrets/filters/gibberish/rfc.model create mode 100644 tests/filters/gibberish_filter_test.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b117bc416..87078a5de 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,8 +29,10 @@ repos: rev: v1.4.4 hooks: - id: autopep8 -- repo: https://github.com/Yelp/detect-secrets - rev: v1.0.1 +- repo: local hooks: - id: detect-secrets + name: Detect secrets + language: python + entry: detect-secrets-hook args: ['--baseline', '.secrets.baseline'] diff --git a/.secrets.baseline b/.secrets.baseline index 60ba42835..28f332244 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -1,5 +1,5 @@ { - "version": "1.0.1", + "version": "1.0.3", "plugins_used": [ { "name": "ArtifactoryDetector" @@ -74,6 +74,10 @@ "path": 
"detect_secrets.filters.common.is_ignored_due_to_verification_policies", "min_level": 2 }, + { + "path": "detect_secrets.filters.gibberish.should_exclude_secret", + "limit": 3.7 + }, { "path": "detect_secrets.filters.heuristic.is_indirect_reference" }, @@ -100,38 +104,6 @@ } ], "results": { - "README.md": [ - { - "type": "Secret Keyword", - "filename": "README.md", - "hashed_secret": "25d176b9bc8c2a063e8319e044bd127b49a15755", - "is_verified": false, - "line_number": 483 - } - ], - "detect_secrets/plugins/keyword.py": [ - { - "type": "Secret Keyword", - "filename": "detect_secrets/plugins/keyword.py", - "hashed_secret": "0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33", - "is_verified": false, - "line_number": 178 - }, - { - "type": "Secret Keyword", - "filename": "detect_secrets/plugins/keyword.py", - "hashed_secret": "62cdb7020ff920e5aa642c3d4066950dd1f01f4d", - "is_verified": false, - "line_number": 189 - }, - { - "type": "Secret Keyword", - "filename": "detect_secrets/plugins/keyword.py", - "hashed_secret": "1af17e73721dbe0c40011b82ed4bb1a7dbe3ce29", - "is_verified": false, - "line_number": 223 - } - ], "detect_secrets/plugins/private_key.py": [ { "type": "Private Key", @@ -230,13 +202,6 @@ "is_verified": false, "line_number": 53 }, - { - "type": "Secret Keyword", - "filename": "docs/design.md", - "hashed_secret": "fc782b0875be9e076d80f5da1430d6ea501c87e5", - "is_verified": false, - "line_number": 54 - }, { "type": "Base64 High Entropy String", "filename": "docs/design.md", @@ -244,16 +209,7 @@ "is_verified": false, "line_number": 200 } - ], - "docs/filters.md": [ - { - "type": "Secret Keyword", - "filename": "docs/filters.md", - "hashed_secret": "4566d0493d8a9b7a811728e852ed5df95fa70dd2", - "is_verified": false, - "line_number": 55 - } ] }, - "generated_at": "2021-02-25T19:11:59Z" + "generated_at": "2021-03-04T15:43:23Z" } diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 000000000..833b8ca2c --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include 
detect_secrets/filters/gibberish/rfc.model diff --git a/README.md b/README.md index 60290ffa5..cc2e9c615 100644 --- a/README.md +++ b/README.md @@ -356,9 +356,6 @@ filter options: If filenames match this regex, it will be ignored. --exclude-secrets EXCLUDE_SECRETS If secrets match this regex, it will be ignored. - --word-list WORD_LIST_FILE - Text file with a list of words, if a secret contains a - word in the list we ignore it. -f FILTER, --filter FILTER Specify path to custom filter. May be a python module path (e.g. @@ -513,17 +510,6 @@ Or you can specify multiple regex rules as such: $ detect-secrets scan --exclude-secrets 'fakesecret' --exclude-secrets '\${.*})' ``` -#### --word-list - -If you know there are certain fake password values that you want to ignore, you can also use -this option: - -```bash -$ cat wordlist.txt -not-a-real-secret -$ detect-secrets scan --word-list wordlist.txt -``` - #### Inline Allowlisting Sometimes, you want to apply an exclusion to a specific line, rather than globally excluding it. @@ -556,6 +542,57 @@ $ detect-secrets scan --only-allowlisted Want to write more custom logic to filter out false positives? Check out how to do this in our [filters documentation](docs/filters.md#Using-Your-Own-Filters). +## Extensions + +### wordlist + +The `--exclude-secrets` flag allows you to specify regex rules to exclude secret values. However, +if you want to specify a large list of words instead, you can use the `--word-list` flag. 
+ +To use this feature, be sure to install the `pyahocorasick` package, or simply use: + +```bash +$ pip install detect-secrets[word_list] +``` + +Then, you can use it as such: + +```bash +$ cat wordlist.txt +not-a-real-secret +$ cat sample.ini +password = not-a-real-secret + +# Will show results +$ detect-secrets scan sample.ini + +# No results found +$ detect-secrets scan --word-list wordlist.txt +``` + +### Gibberish Detector + +The Gibberish Detector is a simple ML model that attempts to determine whether a secret value +is actually gibberish, with the assumption that **real** secret values are not word-like. + +To use this feature, be sure to install the `gibberish-detector` package, or use: + +```bash +$ pip install detect-secrets[gibberish] +``` + +Check out the [gibberish-detector](https://github.com/domanchi/gibberish-detector) package for +more information on how to train the model. A pre-trained model (seeded by processing RFCs) will +be included for easy use. + +You can also specify your own model as such: + +```bash +$ detect-secrets scan --gibberish-model custom.model +``` + +This is not a default plugin, given that this will ignore secrets such as `password`. + ## Caveats This is not meant to be a sure-fire solution to prevent secrets from entering the codebase. Only diff --git a/detect_secrets/core/baseline.py b/detect_secrets/core/baseline.py index f8afeb1a0..e4018d7b6 100644 --- a/detect_secrets/core/baseline.py +++ b/detect_secrets/core/baseline.py @@ -90,6 +90,9 @@ def save_to_file( If you're trying to decide the difference, ask yourself whether there are any changes that does not directly impact the results of the scan. """ + # TODO: I wonder whether this should add the `detect_secrets.filters.common.is_baseline_file` + # filter, since we know the filename already. However, one could argue that it would cause + # this function to "do more than one thing". 
output = secrets if isinstance(secrets, SecretsCollection): output = format_for_output(secrets) diff --git a/detect_secrets/core/usage/filters.py b/detect_secrets/core/usage/filters.py index 82c8e9809..c27ce0843 100644 --- a/detect_secrets/core/usage/filters.py +++ b/detect_secrets/core/usage/filters.py @@ -65,6 +65,19 @@ def add_filter_options(parent: argparse.ArgumentParser) -> None: dest='word_list_file', ) + if filters.gibberish.is_feature_enabled(): + parser.add_argument( + '--gibberish-model', + type=valid_path, + help='Path to model trained with gibberish-detector.', + dest='gibberish_model_file', + ) + parser.add_argument( + '--gibberish-limit', + type=float, + help='Threshold to determine whether a string is gibberish.', + ) + _add_custom_filters(parser) _add_disable_flag(parser) @@ -145,6 +158,16 @@ def parse_args(args: argparse.Namespace) -> None: ): filters.wordlist.initialize(args.word_list_file) + if filters.gibberish.is_feature_enabled(): + kwargs = {} + if args.gibberish_model_file: + kwargs['model_path'] = args.gibberish_model_file + + if args.gibberish_limit: + kwargs['limit'] = args.gibberish_limit + + filters.gibberish.initialize(**kwargs) + if not args.no_verify: get_settings().filters[ 'detect_secrets.filters.common.is_ignored_due_to_verification_policies' diff --git a/detect_secrets/filters/__init__.py b/detect_secrets/filters/__init__.py index e44d782c1..bda705e98 100644 --- a/detect_secrets/filters/__init__.py +++ b/detect_secrets/filters/__init__.py @@ -1,4 +1,5 @@ from . import allowlist # noqa: F401 +from . import gibberish # noqa: F401 from . import heuristic # noqa: F401 from . import regex # noqa: F401 from . 
import wordlist # noqa: F401 diff --git a/detect_secrets/filters/gibberish/__init__.py b/detect_secrets/filters/gibberish/__init__.py new file mode 100644 index 000000000..17384ea5a --- /dev/null +++ b/detect_secrets/filters/gibberish/__init__.py @@ -0,0 +1,93 @@ +import os +import string +from functools import lru_cache +from typing import Any +from typing import Optional + +from ...core.plugins import Plugin +from ...plugins.private_key import PrivateKeyDetector +from ...settings import get_settings +from ..util import compute_file_hash + + +Model = Any + + +def is_feature_enabled() -> bool: + try: + get_model() + return True + except ImportError: + return False + + +def initialize(model_path: Optional[str] = None, limit: float = 3.7) -> None: + """ + :param limit: this limit was obtained through trial and error. Check out + the original pull request for rationale. + + :raises: ValueError + """ + path = model_path + if not path: + path = os.path.join(__path__[0], 'rfc.model') + + model = get_model() + + from gibberish_detector import serializer + from gibberish_detector.exceptions import ParsingError + with open(path) as f: + try: + model.update(serializer.deserialize(f.read())) + except ParsingError: + raise ValueError('Invalid model.') + + config = { + 'limit': limit, + } + if model_path: + config['model'] = model_path + config['file_hash'] = compute_file_hash(model_path) + + path = f'{__name__}.should_exclude_secret' + get_settings().filters[path] = config + + +def should_exclude_secret(secret: str, plugin: Optional[Plugin] = None) -> bool: + """ + :param plugin: optional, for easier testing. The dependency injection system + will populate its proper value on complete runs. + """ + # Private keys are actual words, so they will be a false negative. 
+ if isinstance(plugin, PrivateKeyDetector): + return False + + # Through real-life experimentation, we discovered that the gibberish detector + # works best with non-hex strings, since hex strings have a too limited charset + # to fit our trained models. As such, we cannot make a deterministic decision + # in such cases. + if not (set(secret) - set(string.hexdigits + '-')): + return False + + if not get_model().data or not get_model().charset: + raise AssertionError('Attempting to use uninitialized gibberish model.') + + from gibberish_detector.detector import Detector + detector = Detector( + model=get_model(), + threshold=get_settings().filters[f'{__name__}.should_exclude_secret']['limit'], + ) + + # TODO: secret.lower() is only used currently, since the default model is only + # trained with lower case letters. However, in the future, if people want to train + # a model that is case-sensitive, we can figure out how to change this. + # Unfortunately, it's not straight-forward to just remove the `.lower()` function call, + # since if the string is *not* lowered (and the model expects it to be), the results + # will be quite different. 
+ return not detector.is_gibberish(secret.lower()) + + +@lru_cache(maxsize=1) +def get_model() -> 'Model': + from gibberish_detector.model import Model + return Model(charset='') diff --git a/detect_secrets/filters/gibberish/rfc.model b/detect_secrets/filters/gibberish/rfc.model new file mode 100644 index 000000000..0676ed2a1 --- /dev/null +++ b/detect_secrets/filters/gibberish/rfc.model @@ -0,0 +1 @@ +{"charset": "abcdefghijklmnopqrstuvwxyz", "ngram_size": 2, "counts": {"a": {"a": 118635, "b": 630875, "c": 1501150, "d": 938413, "e": 64942, "f": 203534, "g": 892717, "h": 50302, "i": 629589, "j": 16393, "k": 122527, "l": 2134199, "m": 865412, "n": 3466183, "o": 39385, "p": 708687, "q": 15378, "r": 1990681, "s": 1639856, "t": 4062626, "u": 390482, "v": 290066, "w": 65833, "x": 148257, "y": 497074, "z": 9679}, "b": {"a": 299747, "b": 38842, "c": 66484, "d": 26575, "e": 1340130, "f": 29026, "g": 32212, "h": 8693, "i": 355357, "j": 188222, "k": 6350, "l": 528022, "m": 42838, "n": 36585, "o": 271797, "p": 20216, "q": 1650, "r": 143544, "s": 134924, "t": 55332, "u": 282480, "v": 8513, "w": 9195, "x": 4962, "y": 350774, "z": 1991}, "c": {"a": 1583493, "b": 55072, "c": 366166, "d": 99397, "e": 1680049, "f": 61790, "g": 22751, "h": 1138161, "i": 647914, "j": 23415, "k": 608337, "l": 409128, "m": 129894, "n": 63692, "o": 2266996, "p": 202574, "q": 8192, "r": 525914, "s": 234381, "t": 1400393, "u": 582740, "v": 25213, "w": 27222, "x": 12164, "y": 94929, "z": 3613}, "d": {"a": 1102196, "b": 395535, "c": 205472, "d": 521902, "e": 2349496, "f": 230444, "g": 90095, "h": 107584, "i": 1575337, "j": 30840, "k": 23781, "l": 151769, "m": 188851, "n": 183549, "o": 773165, "p": 252828, "q": 10114, "r": 490139, "s": 679426, "t": 677199, "u": 321170, "v": 101547, "w": 168753, "x": 13779, "y": 59907, "z": 4936}, "e": {"a": 1743853, "b": 293141, "c": 2504452, "d": 3070491, "e": 835273, "f": 950803, "g": 431308, "h": 176469, "i": 1177449, "j": 38230, "k": 70232, "l": 1227191, "m": 1322498, 
"n": 4007948, "o": 705920, "p": 939044, "q": 450185, "r": 5234236, "s": 4637943, "t": 2734401, "u": 284887, "v": 537744, "w": 395249, "x": 721758, "y": 247129, "z": 15923}, "f": {"a": 482902, "b": 46301, "c": 776635, "d": 74453, "e": 459377, "f": 324747, "g": 15818, "h": 24876, "i": 1326013, "j": 5135, "k": 7786, "l": 154253, "m": 68036, "n": 47090, "o": 1525315, "p": 71256, "q": 7352, "r": 399118, "s": 158690, "t": 743932, "u": 192428, "v": 25544, "w": 23974, "x": 11395, "y": 80258, "z": 5178}, "g": {"a": 374124, "b": 43000, "c": 75143, "d": 54891, "e": 1210541, "f": 61951, "g": 89766, "h": 265348, "i": 434656, "j": 9650, "k": 8369, "l": 111072, "m": 141157, "n": 262172, "o": 233776, "p": 126045, "q": 3985, "r": 368752, "s": 199803, "t": 394592, "u": 242465, "v": 16518, "w": 40581, "x": 7464, "y": 46646, "z": 4033}, "h": {"a": 1557099, "b": 48781, "c": 89817, "d": 47268, "e": 4944439, "f": 32193, "g": 12079, "h": 20750, "i": 1105135, "j": 3953, "k": 9651, "l": 32170, "m": 125637, "n": 76152, "o": 775525, "p": 47687, "q": 3047, "r": 145761, "s": 83710, "t": 413869, "u": 73213, "v": 16513, "w": 22152, "x": 4223, "y": 37868, "z": 6960}, "i": {"a": 545417, "b": 441765, "c": 1649054, "d": 823145, "e": 1026129, "f": 883435, "g": 624619, "h": 16297, "i": 47452, "j": 11907, "k": 56496, "l": 788318, "m": 672818, "n": 5188085, "o": 2696322, "p": 641038, "q": 31721, "r": 507827, "s": 2474217, "t": 2145858, "u": 27798, "v": 521907, "w": 8937, "x": 103556, "y": 3155, "z": 178043}, "j": {"a": 65309, "b": 5150, "c": 4630, "d": 3609, "e": 207116, "f": 2967, "g": 2140, "h": 3674, "i": 12565, "j": 1966, "k": 4624, "l": 3278, "m": 6958, "n": 5271, "o": 59640, "p": 7497, "q": 1402, "r": 4799, "s": 15013, "t": 4209, "u": 80334, "v": 2082, "w": 6849, "x": 1481, "y": 1382, "z": 1520}, "k": {"a": 91393, "b": 24180, "c": 35534, "d": 25923, "e": 557437, "f": 32487, "g": 8856, "h": 16581, "i": 173899, "j": 5028, "k": 9027, "l": 40471, "m": 36600, "n": 89335, "o": 53389, "p": 133808, "q": 
2485, "r": 28645, "s": 177604, "t": 62173, "u": 27442, "v": 5748, "w": 27477, "x": 2830, "y": 6013, "z": 1805}, "l": {"a": 929370, "b": 103217, "c": 139662, "d": 512423, "e": 1824337, "f": 113308, "g": 78226, "h": 36656, "i": 1450931, "j": 14335, "k": 24164, "l": 983903, "m": 107923, "n": 110976, "o": 905778, "p": 211582, "q": 5416, "r": 114840, "s": 608836, "t": 453915, "u": 460820, "v": 96718, "w": 49654, "x": 6631, "y": 577831, "z": 11631}, "m": {"a": 1635769, "b": 287867, "c": 67799, "d": 50279, "e": 2204807, "f": 38281, "g": 16738, "h": 16864, "i": 672936, "j": 6422, "k": 6652, "l": 90860, "m": 268828, "n": 40050, "o": 527359, "p": 791104, "q": 3281, "r": 47377, "s": 279178, "t": 188486, "u": 454062, "v": 16237, "w": 25264, "x": 6360, "y": 9965, "z": 2639}, "n": {"a": 1538477, "b": 200065, "c": 997362, "d": 2330762, "e": 1952162, "f": 625660, "g": 1940478, "h": 83259, "i": 1015263, "j": 28397, "k": 150610, "l": 214383, "m": 226678, "n": 356573, "o": 1367298, "p": 210677, "q": 9545, "r": 232943, "s": 1723024, "t": 4051859, "u": 306661, "v": 137961, "w": 115060, "x": 18770, "y": 191590, "z": 12659}, "o": {"a": 333611, "b": 507058, "c": 1030631, "d": 621611, "e": 154901, "f": 1509661, "g": 182750, "h": 53343, "i": 324137, "j": 11882, "k": 87654, "l": 804704, "m": 994391, "n": 4575413, "o": 219213, "p": 683677, "q": 4149, "r": 3464564, "s": 641767, "t": 1406855, "u": 1149771, "v": 410511, "w": 561073, "x": 56638, "y": 39476, "z": 8869}, "p": {"a": 1189585, "b": 42076, "c": 124961, "d": 126398, "e": 1242826, "f": 80409, "g": 17643, "h": 137438, "i": 266019, "j": 9649, "k": 35379, "l": 828131, "m": 106979, "n": 68507, "o": 908296, "p": 557551, "q": 4360, "r": 1408275, "s": 295854, "t": 588109, "u": 216552, "v": 162392, "w": 69550, "x": 9971, "y": 42770, "z": 2061}, "q": {"a": 5362, "b": 2396, "c": 3695, "d": 4903, "e": 2437, "f": 2492, "g": 1606, "h": 1937, "i": 6228, "j": 1178, "k": 1502, "l": 2222, "m": 5600, "n": 6138, "o": 16223, "p": 3612, "q": 2178, "r": 4314, 
"s": 7449, "t": 4457, "u": 523091, "v": 2218, "w": 1957, "x": 1602, "y": 1593, "z": 1147}, "r": {"a": 1884150, "b": 108467, "c": 469520, "d": 619333, "e": 4613547, "f": 928117, "g": 206040, "h": 70002, "i": 1924644, "j": 14216, "k": 336855, "l": 161805, "m": 782008, "n": 396973, "o": 2026650, "p": 279715, "q": 9201, "r": 543892, "s": 939303, "t": 1323019, "u": 276419, "v": 443167, "w": 154662, "x": 13149, "y": 462288, "z": 6346}, "s": {"a": 1571556, "b": 200500, "c": 850779, "d": 386238, "e": 3128653, "f": 376110, "g": 84160, "h": 508213, "i": 1940017, "j": 27903, "k": 82209, "l": 194818, "m": 459368, "n": 353397, "o": 1116229, "p": 919294, "q": 11941, "r": 342864, "s": 1854672, "t": 3581542, "u": 824325, "v": 78034, "w": 307467, "x": 24837, "y": 215803, "z": 8336}, "t": {"a": 2239522, "b": 240284, "c": 445118, "d": 188710, "e": 3539356, "f": 299799, "g": 48582, "h": 6502699, "i": 4442428, "j": 13452, "k": 25898, "l": 277765, "m": 287203, "n": 183293, "o": 2396902, "p": 414605, "q": 8754, "r": 1552404, "s": 1302370, "t": 1061425, "u": 546833, "v": 47746, "w": 473638, "x": 30926, "y": 739526, "z": 16025}, "u": {"a": 254536, "b": 217707, "c": 270253, "d": 169700, "e": 638912, "f": 36218, "g": 117660, "h": 6047, "i": 258834, "j": 3202, "k": 8410, "l": 739209, "m": 462935, "n": 660975, "o": 17845, "p": 395636, "q": 1456, "r": 975780, "s": 1272647, "t": 937723, "u": 9229, "v": 4979, "w": 4665, "x": 7779, "y": 2786, "z": 3638}, "v": {"a": 553164, "b": 9641, "c": 37594, "d": 16968, "e": 1438129, "f": 10523, "g": 4867, "h": 8538, "i": 626442, "j": 5603, "k": 2505, "l": 16039, "m": 21836, "n": 13803, "o": 81911, "p": 80950, "q": 2070, "r": 27617, "s": 33989, "t": 22994, "u": 9415, "v": 8120, "w": 6624, "x": 3053, "y": 3050, "z": 1843}, "w": {"a": 328181, "b": 12472, "c": 24164, "d": 16960, "e": 296182, "f": 14279, "g": 12677, "h": 420451, "i": 733532, "j": 2434, "k": 6230, "l": 49180, "m": 17167, "n": 100428, "o": 450177, "p": 19437, "q": 1897, "r": 77674, "s": 115640, "t": 
62041, "u": 7219, "v": 6072, "w": 98652, "x": 2572, "y": 4494, "z": 1484}, "x": {"a": 195222, "b": 15941, "c": 99442, "d": 23958, "e": 57299, "f": 23622, "g": 4359, "h": 9176, "i": 121876, "j": 3221, "k": 2420, "l": 12061, "m": 60813, "n": 10788, "o": 24409, "p": 147454, "q": 1549, "r": 18376, "s": 56593, "t": 281697, "u": 8440, "v": 6979, "w": 6973, "x": 34004, "y": 28368, "z": 1965}, "y": {"a": 299426, "b": 136713, "c": 149196, "d": 90086, "e": 172967, "f": 77444, "g": 22164, "h": 43433, "i": 236429, "j": 8732, "k": 13359, "l": 81444, "m": 131819, "n": 169093, "o": 195258, "p": 444042, "q": 5482, "r": 140869, "s": 328005, "t": 375785, "u": 48208, "v": 23601, "w": 81979, "x": 4858, "y": 8417, "z": 7340}, "z": {"a": 76890, "b": 2486, "c": 3088, "d": 2886, "e": 144164, "f": 1805, "g": 1705, "h": 9255, "i": 15379, "j": 1740, "k": 1506, "l": 3047, "m": 2930, "n": 2134, "o": 21144, "p": 2423, "q": 1128, "r": 7939, "s": 5601, "t": 3439, "u": 2997, "v": 2471, "w": 2121, "x": 1812, "y": 2670, "z": 4454}}} diff --git a/detect_secrets/filters/util.py b/detect_secrets/filters/util.py index 9acdb84e8..54455c81e 100644 --- a/detect_secrets/filters/util.py +++ b/detect_secrets/filters/util.py @@ -1,3 +1,4 @@ +import hashlib import inspect @@ -23,3 +24,27 @@ def get_caller_path(offset: int = 0) -> str: module_path = frame_info.frame.f_globals['__name__'] function_name = frame_info.function return f'{module_path}.{function_name}' + + +def compute_file_hash(filename: str, buffer_size: int = 64 * 1024) -> str: + """ + When we make any modifications to the inputs of a baseline, the baseline should + also reflect these changes. Otherwise, we can get into a strange situation of + irreproducibility: a "hidden" change to the underlying model may produce different + results. + + To ensure this doesn't happen, we capture a hash of the model that we are using. + This way, if the model changes, the baseline will have to change, and we will be + able to better track the changes this way. 
+ + This is akin to: + $ sha1sum + """ + sha1 = hashlib.sha1() + with open(filename, 'rb') as f: + data = f.read(buffer_size) + while data: + sha1.update(data) + data = f.read(buffer_size) + + return sha1.hexdigest() diff --git a/detect_secrets/filters/wordlist.py b/detect_secrets/filters/wordlist.py index 4c1968fd2..690fa0325 100644 --- a/detect_secrets/filters/wordlist.py +++ b/detect_secrets/filters/wordlist.py @@ -4,11 +4,11 @@ will result in false positives. This filter efficiently processes this through the use of the Aho-Corasick algorithm. """ -import hashlib from functools import lru_cache from typing import Any from ..settings import get_settings +from .util import compute_file_hash Automaton = Any @@ -50,7 +50,7 @@ def initialize(wordlist_filename: str, min_length: int = 3, file_hash: str = '') get_settings().filters[path] = { 'min_length': min_length, 'file_name': wordlist_filename, - 'file_hash': _compute_wordlist_hash(wordlist_filename), + 'file_hash': compute_file_hash(wordlist_filename), } automaton.make_automaton() @@ -70,21 +70,3 @@ def should_exclude_secret(secret: str) -> bool: def get_automaton() -> Automaton: import ahocorasick return ahocorasick.Automaton() - - -def _compute_wordlist_hash(filename: str, buffer_size: int = 64 * 1024) -> str: - """ - We compute the hash based on the file contents, rather than the filename itself, since we - want to know if the underlying contents of the file changes. - - This is akin to: - $ sha1sum - """ - sha1 = hashlib.sha1() - with open(filename, 'rb') as f: - data = f.read(buffer_size) - while data: - sha1.update(data) - data = f.read(buffer_size) - - return sha1.hexdigest() diff --git a/detect_secrets/plugins/keyword.py b/detect_secrets/plugins/keyword.py index dbf0e0e72..f67c84b0d 100644 --- a/detect_secrets/plugins/keyword.py +++ b/detect_secrets/plugins/keyword.py @@ -50,101 +50,7 @@ 'secret', 'secrete', ) -''' -Deprecated false positives list. This will be migrated soon. 
-FALSE_POSITIVES = { - '""', - '""):', - '"\'', - '")', - '"dummy', - '"replace', - '"this', - '#pass', - '#password', - '$(shell', - "'\"", - "''", - "''):", - "')", - "'dummy", - "'replace", - "'this", - '(nsstring', - '-default}', - '::', - '<%=', - '', - '', - '', - '', - '=', - '\\"$(shell', - '\\k.*"', - "\\k.*'", - '`cat', - '`grep', - '`sudo', - 'account_password', - 'api_key', - 'disable', - 'dummy_secret', - 'dummy_value', - 'false', - 'false):', - 'false,', - 'false;', - 'login_password', - 'none', - 'none,', - 'none}', - 'nopasswd', - 'not', - 'not_real_key', - 'null', - 'null,', - 'null.*"', - "null.*'", - 'null;', - 'pass', - 'pass)', - 'password', - 'password)', - 'password))', - 'password,', - 'password},', - 'prompt', - 'redacted', - 'secret', - 'some_key', - 'str', - 'str_to_sign', - 'string', - 'string)', - 'string,', - 'string;', - 'string?', - 'string?)', - 'string}', - 'string}}', - 'test', - 'test-access-key', - 'thisisnottherealsecret', - 'todo', - 'true', - 'true):', - 'true,', - 'true;', - 'undef', - 'undef,', - '{', - '{{', -} -''' + # Includes ], ', " as closing CLOSING = r'[]\'"]{0,2}' DENYLIST_REGEX = r'|'.join(DENYLIST) diff --git a/detect_secrets/settings.py b/detect_secrets/settings.py index 17ed6ef2d..5a4664780 100644 --- a/detect_secrets/settings.py +++ b/detect_secrets/settings.py @@ -43,6 +43,15 @@ def configure_settings_from_baseline(baseline: Dict[str, Any], filename: str = ' file_hash=config['file_hash'], ) + if 'detect_secrets.filters.gibberish.should_exclude_secret' in settings.filters: + config = settings.filters['detect_secrets.filters.gibberish.should_exclude_secret'] + + from detect_secrets import filters + filters.gibberish.initialize( + model_path=config.get('model'), + limit=config['limit'], + ) + if filename: settings.filters['detect_secrets.filters.common.is_baseline_file'] = { 'filename': filename, @@ -129,6 +138,7 @@ def configure_plugins(self, config: List[Dict[str, Any]]) -> 'Settings': name = 
plugin.pop('name') self.plugins[name] = plugin + get_plugins.cache_clear() return self def disable_plugins(self, *plugin_names: str) -> 'Settings': @@ -138,6 +148,7 @@ def disable_plugins(self, *plugin_names: str) -> 'Settings': except KeyError: pass + get_plugins.cache_clear() return self def configure_filters(self, config: List[Dict[str, Any]]) -> 'Settings': @@ -162,12 +173,14 @@ def configure_filters(self, config: List[Dict[str, Any]]) -> 'Settings': path = filter_config['path'] self.filters[path] = filter_config + get_filters.cache_clear() return self def disable_filters(self, *filter_paths: str) -> 'Settings': for filter_path in filter_paths: self.filters.pop(filter_path, None) + get_filters.cache_clear() return self def json(self) -> Dict[str, Any]: diff --git a/docs/filters.md b/docs/filters.md index f0e721457..0160c1aa4 100644 --- a/docs/filters.md +++ b/docs/filters.md @@ -46,6 +46,7 @@ the `detect_secrets.filters` namespace. | `common.is_invalid_file` | Ignores files that are not files (e.g. links). | | `common.is_baseline_file` | Ignores the baseline file itself. | | `common.is_ignored_due_to_verification_policies` | Powers secret verification functionality. | +| `gibberish.should_exclude_secret` | Excludes secrets that are not gibberish looking strings. | | `heuristic.is_indirect_reference` | Primarily for `KeywordDetector`, filters secrets like `secret = get_secret_key()`. | | `heuristic.is_likely_id_string` | Ignores secret values prefixed with `id`. | | `heuristic.is_non_text_file` | Ignores non-text files (e.g. archives, images). 
| diff --git a/requirements-dev-minimal.txt b/requirements-dev-minimal.txt index 66f45e063..4d2fb9892 100644 --- a/requirements-dev-minimal.txt +++ b/requirements-dev-minimal.txt @@ -2,6 +2,7 @@ # on python 3.6.0 (xenial) coverage<5 flake8==3.5.0 +gibberish-detector>=0.1.1 monotonic mypy pre-commit diff --git a/requirements-dev.txt b/requirements-dev.txt index 20b8fd98d..5853095ee 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -7,6 +7,7 @@ coverage==4.5.4 distlib==0.3.1 filelock==3.0.12 flake8==3.5.0 +gibberish-detector==0.1.1 identify==1.5.10 idna==2.10 importlib-metadata==2.1.1 diff --git a/setup.py b/setup.py index e19b75fe9..49f16f820 100644 --- a/setup.py +++ b/setup.py @@ -34,10 +34,14 @@ def get_version(): 'pyyaml', 'requests', ], + include_package_data=True, extras_require={ 'word_list': [ 'pyahocorasick', ], + 'gibberish': [ + 'gibberish-detector', + ], }, entry_points={ 'console_scripts': [ diff --git a/testing/mocks.py b/testing/mocks.py index f1ec173fc..120c3b87d 100644 --- a/testing/mocks.py +++ b/testing/mocks.py @@ -71,3 +71,19 @@ def debug(self, message: str, *args: Any) -> None: @property def debug_messages(self) -> str: # pragma: no cover return self.messages['debug'] + + +@contextmanager +def disable_gibberish_filter(): + """ + Unfortunately, we can't just use `Settings.disable_filters`, since `parse_args` is + the function that *enables* this filter. Therefore, for test cases that test through + the `main` function flow, we can't disable the filter before the function call. + + However, since this only happens in test environments, we can just mock it out. 
+ """ + with mock.patch( + 'detect_secrets.filters.gibberish.is_feature_enabled', + return_value=False, + ): + yield diff --git a/tests/filters/common_filter_test.py b/tests/filters/common_filter_test.py index 81cd8e9a4..be6bc6032 100644 --- a/tests/filters/common_filter_test.py +++ b/tests/filters/common_filter_test.py @@ -14,7 +14,7 @@ class TestVerify: @staticmethod def test_does_not_verify_if_no_verify(): with register_plugin(MockPlugin(should_verify=False)): - main_module.main(['scan', '--string', 'fake-secret', '--no-verify']) + main_module.main(['scan', '--string', 'deadbeef', '--no-verify']) @staticmethod @pytest.mark.parametrize( @@ -30,7 +30,7 @@ def test_adheres_to_verification_policies(args, verified_result, should_be_prese with register_plugin( MockPlugin(verified_result=verified_result), ), mock_printer(main_module) as printer: - main_module.main(['scan', '--string', 'fake-secret', *args]) + main_module.main(['scan', '--string', 'deadbeef', *args]) for line in printer.message.splitlines(): plugin_name, result = [x.strip() for x in line.split(':')] @@ -54,7 +54,8 @@ def test_handles_request_error_gracefully(): class MockPlugin(RegexBasedDetector): denylist = ( - re.compile('fake-secret'), + # We use a hex string here, due to the gibberish detector. 
+ re.compile('deadbeef'), ) secret_type = 'mock plugin' diff --git a/tests/filters/gibberish_filter_test.py b/tests/filters/gibberish_filter_test.py new file mode 100644 index 000000000..a526f8ad1 --- /dev/null +++ b/tests/filters/gibberish_filter_test.py @@ -0,0 +1,61 @@ +import os + +import pytest + +from detect_secrets import filters +from detect_secrets.plugins.private_key import PrivateKeyDetector +from detect_secrets.settings import transient_settings + + +class TestShouldExcludeSecret: + @staticmethod + @pytest.fixture(autouse=True) + def initialize(): + filters.gibberish.initialize( + model_path=os.path.join(filters.gibberish.__path__[0], 'rfc.model'), + ) + + try: + yield + finally: + filters.gibberish.get_model.cache_clear() + + @staticmethod + @pytest.mark.parametrize( + 'secret', + ( + 'this-is-a-bad-password', + + # URLs (which have been traditionally picked up by Base64HighEntropyString) + # are now excluded! + '/biz_user/NCIygBmcWTENrE1n06oprA/business_ids/v1', + + # same thing with long strings + 'k8s-KUBE_CLUSTER-ca/issue/k8s-prometheus-adapter', + ), + ) + def test_success(secret): + assert filters.gibberish.should_exclude_secret(secret) + + @staticmethod + def test_ignores_hex_strings(): + assert not filters.gibberish.should_exclude_secret('2b00042f7481c7b056c4b410d28f33cf') + + @staticmethod + def test_does_not_affect_private_keys(): + assert not filters.gibberish.should_exclude_secret( + 'BEGIN PRIVATE KEY', + plugin=PrivateKeyDetector(), + ) + + +def test_load_from_baseline(): + with transient_settings({ + 'filters_used': [{ + 'path': 'detect_secrets.filters.gibberish.should_exclude_secret', + 'model': os.path.join(filters.gibberish.__path__[0], 'rfc.model'), + 'file_hash': '00b672f709e9bf51fe2e09abe247ac3b6415d645', + 'limit': 3.7, + }], + }): + assert filters.gibberish.should_exclude_secret('clearly-not-a-secret') diff --git a/tests/main_test.py b/tests/main_test.py index 928cc1aab..187b401a2 100644 --- a/tests/main_test.py +++ 
b/tests/main_test.py @@ -9,6 +9,7 @@ from detect_secrets.core.secrets_collection import SecretsCollection from detect_secrets.main import scan_adhoc_string from detect_secrets.settings import transient_settings +from testing.mocks import disable_gibberish_filter from testing.mocks import mock_printer @@ -130,7 +131,7 @@ def test_supports_stdin(): ], }), mock_stdin( 'AKIATESTTESTTESTTEST', - ), mock_printer(main_module) as printer: + ), mock_printer(main_module) as printer, disable_gibberish_filter(): assert main_module.main(['scan', '--string']) == 0 assert printer.message.strip() == 'AWSKeyDetector: True (unverified)' diff --git a/tests/pre_commit_hook_test.py b/tests/pre_commit_hook_test.py index 05070995a..4c44fd7fe 100644 --- a/tests/pre_commit_hook_test.py +++ b/tests/pre_commit_hook_test.py @@ -11,6 +11,7 @@ from detect_secrets.core.secrets_collection import SecretsCollection from detect_secrets.pre_commit_hook import main from detect_secrets.settings import transient_settings +from testing.mocks import disable_gibberish_filter @pytest.fixture(autouse=True) @@ -50,31 +51,33 @@ def test_baseline_filters_out_known_secrets(): secrets = SecretsCollection() secrets.scan_file('test_data/each_secret.py') - with tempfile.NamedTemporaryFile() as f: - baseline.save_to_file(secrets, f.name) - f.seek(0) + assert secrets - # This succeeds, because all the secrets are known. - assert_commit_succeeds([ - 'test_data/each_secret.py', - '--baseline', - f.name, - ]) + with disable_gibberish_filter(): + with tempfile.NamedTemporaryFile() as f: + baseline.save_to_file(secrets, f.name) + f.seek(0) - # Remove one arbitrary secret, so that it won't be the full set. - secrets.data['test_data/each_secret.py'].pop() + # This succeeds, because all the secrets are known. 
+ assert_commit_succeeds([ + 'test_data/each_secret.py', + '--baseline', + f.name, + ]) - with tempfile.NamedTemporaryFile() as f: - baseline.save_to_file(secrets, f.name) - f.seek(0) + # Remove one arbitrary secret, so that it won't be the full set. + secrets.data['test_data/each_secret.py'].pop() - # Test that it isn't the case that a baseline is provided, and everything passes. - # import pdb; pdb.set_trace() - assert_commit_blocked([ - 'test_data/each_secret.py', - '--baseline', - f.name, - ]) + with tempfile.NamedTemporaryFile() as f: + baseline.save_to_file(secrets, f.name) + f.seek(0) + + # Test that it isn't the case that a baseline is provided, and everything passes. + assert_commit_blocked([ + 'test_data/each_secret.py', + '--baseline', + f.name, + ]) class TestModifiesBaselineFromVersionChange: