From 2a063928161298cb5db8792de327c8c9d892ae92 Mon Sep 17 00:00:00 2001
From: Aaron Loo <aaronloo@yelp.com>
Date: Thu, 4 Mar 2021 07:53:03 -0800
Subject: [PATCH] adding gibberish detector

---
 .pre-commit-config.yaml                      |  6 +-
 .secrets.baseline                            | 56 ++----------
 MANIFEST.in                                  |  1 +
 README.md                                    | 65 ++++++++++---
 detect_secrets/core/baseline.py              |  3 +
 detect_secrets/core/usage/filters.py         | 23 +++++
 detect_secrets/filters/__init__.py           |  1 +
 detect_secrets/filters/gibberish/__init__.py | 93 +++++++++++++++++++
 detect_secrets/filters/gibberish/rfc.model   |  1 +
 detect_secrets/filters/util.py               | 25 +++++
 detect_secrets/filters/wordlist.py           | 22 +----
 detect_secrets/plugins/keyword.py            | 96 +-------------------
 detect_secrets/settings.py                   | 13 +++
 docs/filters.md                              |  1 +
 requirements-dev-minimal.txt                 |  1 +
 requirements-dev.txt                         |  1 +
 setup.py                                     |  4 +
 testing/mocks.py                             | 16 ++++
 tests/filters/common_filter_test.py          |  7 +-
 tests/filters/gibberish_filter_test.py       | 61 +++++++++++++
 tests/main_test.py                           |  3 +-
 tests/pre_commit_hook_test.py                | 45 ++++-----
 22 files changed, 338 insertions(+), 206 deletions(-)
 create mode 100644 MANIFEST.in
 create mode 100644 detect_secrets/filters/gibberish/__init__.py
 create mode 100644 detect_secrets/filters/gibberish/rfc.model
 create mode 100644 tests/filters/gibberish_filter_test.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b117bc416..87078a5de 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -29,8 +29,10 @@ repos:
     rev: v1.4.4
     hooks:
     -   id: autopep8
--   repo: https://github.com/Yelp/detect-secrets
-    rev: v1.0.1
+-   repo: local
     hooks:
     -   id: detect-secrets
+        name: Detect secrets
+        language: python
+        entry: detect-secrets-hook
         args: ['--baseline', '.secrets.baseline']
diff --git a/.secrets.baseline b/.secrets.baseline
index 60ba42835..28f332244 100644
--- a/.secrets.baseline
+++ b/.secrets.baseline
@@ -1,5 +1,5 @@
 {
-  "version": "1.0.1",
+  "version": "1.0.3",
   "plugins_used": [
     {
       "name": "ArtifactoryDetector"
@@ -74,6 +74,10 @@
       "path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies",
       "min_level": 2
     },
+    {
+      "path": "detect_secrets.filters.gibberish.should_exclude_secret",
+      "limit": 3.7
+    },
     {
       "path": "detect_secrets.filters.heuristic.is_indirect_reference"
     },
@@ -100,38 +104,6 @@
     }
   ],
   "results": {
-    "README.md": [
-      {
-        "type": "Secret Keyword",
-        "filename": "README.md",
-        "hashed_secret": "25d176b9bc8c2a063e8319e044bd127b49a15755",
-        "is_verified": false,
-        "line_number": 483
-      }
-    ],
-    "detect_secrets/plugins/keyword.py": [
-      {
-        "type": "Secret Keyword",
-        "filename": "detect_secrets/plugins/keyword.py",
-        "hashed_secret": "0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33",
-        "is_verified": false,
-        "line_number": 178
-      },
-      {
-        "type": "Secret Keyword",
-        "filename": "detect_secrets/plugins/keyword.py",
-        "hashed_secret": "62cdb7020ff920e5aa642c3d4066950dd1f01f4d",
-        "is_verified": false,
-        "line_number": 189
-      },
-      {
-        "type": "Secret Keyword",
-        "filename": "detect_secrets/plugins/keyword.py",
-        "hashed_secret": "1af17e73721dbe0c40011b82ed4bb1a7dbe3ce29",
-        "is_verified": false,
-        "line_number": 223
-      }
-    ],
     "detect_secrets/plugins/private_key.py": [
       {
         "type": "Private Key",
@@ -230,13 +202,6 @@
         "is_verified": false,
         "line_number": 53
       },
-      {
-        "type": "Secret Keyword",
-        "filename": "docs/design.md",
-        "hashed_secret": "fc782b0875be9e076d80f5da1430d6ea501c87e5",
-        "is_verified": false,
-        "line_number": 54
-      },
       {
         "type": "Base64 High Entropy String",
         "filename": "docs/design.md",
@@ -244,16 +209,7 @@
         "is_verified": false,
         "line_number": 200
       }
-    ],
-    "docs/filters.md": [
-      {
-        "type": "Secret Keyword",
-        "filename": "docs/filters.md",
-        "hashed_secret": "4566d0493d8a9b7a811728e852ed5df95fa70dd2",
-        "is_verified": false,
-        "line_number": 55
-      }
     ]
   },
-  "generated_at": "2021-02-25T19:11:59Z"
+  "generated_at": "2021-03-04T15:43:23Z"
 }
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 000000000..833b8ca2c
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1 @@
+include detect_secrets/filters/gibberish/rfc.model
diff --git a/README.md b/README.md
index 60290ffa5..cc2e9c615 100644
--- a/README.md
+++ b/README.md
@@ -356,9 +356,6 @@ filter options:
                         If filenames match this regex, it will be ignored.
   --exclude-secrets EXCLUDE_SECRETS
                         If secrets match this regex, it will be ignored.
-  --word-list WORD_LIST_FILE
-                        Text file with a list of words, if a secret contains a
-                        word in the list we ignore it.
   -f FILTER, --filter FILTER
                         Specify path to custom filter. May be a python module
                         path (e.g.
@@ -513,17 +510,6 @@ Or you can specify multiple regex rules as such:
 $ detect-secrets scan --exclude-secrets 'fakesecret' --exclude-secrets '\${.*})'
 ```
 
-#### --word-list
-
-If you know there are certain fake password values that you want to ignore, you can also use
-this option:
-
-```bash
-$ cat wordlist.txt
-not-a-real-secret
-$ detect-secrets scan --word-list wordlist.txt
-```
-
 #### Inline Allowlisting
 
 Sometimes, you want to apply an exclusion to a specific line, rather than globally excluding it.
@@ -556,6 +542,57 @@ $ detect-secrets scan --only-allowlisted
 Want to write more custom logic to filter out false positives? Check out how to do this in
 our [filters documentation](docs/filters.md#Using-Your-Own-Filters).
 
+## Extensions
+
+### wordlist
+
+The `--exclude-secrets` flag allows you to specify regex rules to exclude secret values. However,
+if you want to specify a large list of words instead, you can use the `--word-list` flag.
+
+To use this feature, be sure to install the `pyahocorasick` package, or simply use:
+
+```bash
+$ pip install detect-secrets[word_list]
+```
+
+Then, you can use it as such:
+
+```bash
+$ cat wordlist.txt
+not-a-real-secret
+$ cat sample.ini
+password = not-a-real-secret
+
+# Will show results
+$ detect-secrets scan sample.ini
+
+# No results found
+$ detect-secrets scan --word-list wordlist.txt sample.ini
+```
+
+### Gibberish Detector
+
+The Gibberish Detector is a simple ML model that attempts to determine whether a secret value
+is actually gibberish, with the assumption that **real** secret values are not word-like.
+
+To use this feature, be sure to install the `gibberish-detector` package, or use:
+
+```bash
+$ pip install detect-secrets[gibberish]
+```
+
+Check out the [gibberish-detector](https://github.com/domanchi/gibberish-detector) package for
+more information on how to train the model. A pre-trained model (seeded by processing RFCs) will
+be included for easy use.
+
+You can also specify your own model as such:
+
+```bash
+$ detect-secrets scan --gibberish-model custom.model
+```
+
+This is not a default filter, given that it will ignore word-like secrets such as `password`.
+
 ## Caveats
 
 This is not meant to be a sure-fire solution to prevent secrets from entering the codebase. Only
diff --git a/detect_secrets/core/baseline.py b/detect_secrets/core/baseline.py
index f8afeb1a0..e4018d7b6 100644
--- a/detect_secrets/core/baseline.py
+++ b/detect_secrets/core/baseline.py
@@ -90,6 +90,9 @@ def save_to_file(
         If you're trying to decide the difference, ask yourself whether there are any changes
         that does not directly impact the results of the scan.
     """
+    # TODO: I wonder whether this should add the `detect_secrets.filters.common.is_baseline_file`
+    # filter, since we know the filename already. However, one could argue that it would cause
+    # this function to "do more than one thing".
     output = secrets
     if isinstance(secrets, SecretsCollection):
         output = format_for_output(secrets)
diff --git a/detect_secrets/core/usage/filters.py b/detect_secrets/core/usage/filters.py
index 82c8e9809..c27ce0843 100644
--- a/detect_secrets/core/usage/filters.py
+++ b/detect_secrets/core/usage/filters.py
@@ -65,6 +65,19 @@ def add_filter_options(parent: argparse.ArgumentParser) -> None:
             dest='word_list_file',
         )
 
+    if filters.gibberish.is_feature_enabled():
+        parser.add_argument(
+            '--gibberish-model',
+            type=valid_path,
+            help='Path to model trained with gibberish-detector.',
+            dest='gibberish_model_file',
+        )
+        parser.add_argument(
+            '--gibberish-limit',
+            type=float,
+            help='Threshold to determine whether a string is gibberish.',
+        )
+
     _add_custom_filters(parser)
     _add_disable_flag(parser)
 
@@ -145,6 +158,16 @@ def parse_args(args: argparse.Namespace) -> None:
     ):
         filters.wordlist.initialize(args.word_list_file)
 
+    if filters.gibberish.is_feature_enabled():
+        kwargs = {}
+        if args.gibberish_model_file:
+            kwargs['model_path'] = args.gibberish_model_file
+
+        if args.gibberish_limit:
+            kwargs['limit'] = args.gibberish_limit
+
+        filters.gibberish.initialize(**kwargs)
+
     if not args.no_verify:
         get_settings().filters[
             'detect_secrets.filters.common.is_ignored_due_to_verification_policies'
diff --git a/detect_secrets/filters/__init__.py b/detect_secrets/filters/__init__.py
index e44d782c1..bda705e98 100644
--- a/detect_secrets/filters/__init__.py
+++ b/detect_secrets/filters/__init__.py
@@ -1,4 +1,5 @@
 from . import allowlist     # noqa: F401
+from . import gibberish     # noqa: F401
 from . import heuristic     # noqa: F401
 from . import regex         # noqa: F401
 from . import wordlist      # noqa: F401
diff --git a/detect_secrets/filters/gibberish/__init__.py b/detect_secrets/filters/gibberish/__init__.py
new file mode 100644
index 000000000..17384ea5a
--- /dev/null
+++ b/detect_secrets/filters/gibberish/__init__.py
@@ -0,0 +1,93 @@
+import os
+import string
+from functools import lru_cache
+from typing import Any
+from typing import Optional
+
+from ...core.plugins import Plugin
+from ...plugins.private_key import PrivateKeyDetector
+from ...settings import get_settings
+from ..util import compute_file_hash
+
+
+Model = Any
+
+
+def is_feature_enabled() -> bool:
+    try:
+        get_model()
+        return True
+    except ImportError:
+        return False
+
+
+def initialize(model_path: Optional[str] = None, limit: float = 3.7) -> None:
+    """
+    :param limit: this limit was obtained through trial and error. Check out
+        the original pull request for rationale.
+
+    :raises: ValueError
+    """
+    path = model_path
+    if not path:
+        path = os.path.join(__path__[0], 'rfc.model')
+
+    model = get_model()
+
+    from gibberish_detector import serializer
+    from gibberish_detector.exceptions import ParsingError
+    with open(path) as f:
+        try:
+            model.update(serializer.deserialize(f.read()))
+        except ParsingError:
+            raise ValueError('Invalid model.')
+
+    config = {
+        'limit': limit,
+    }
+    if model_path:
+        config['model'] = model_path
+        config['file_hash'] = compute_file_hash(model_path)
+
+    path = f'{__name__}.should_exclude_secret'
+    get_settings().filters[path] = config
+
+
+def should_exclude_secret(secret: str, plugin: Optional[Plugin] = None) -> bool:
+    """
+    :param plugin: optional, for easier testing. The dependency injection system
+        will populate its proper value on complete runs.
+    """
+    # Private key secrets are actual words, so filtering them would cause false negatives.
+    if isinstance(plugin, PrivateKeyDetector):
+        return False
+
+    # Through real-life experimentation, we discovered that the gibberish detector
+    # works best with non-hex strings, since hex strings have too limited a charset
+    # to fit our trained models. As such, we cannot make a deterministic decision
+    # in such cases.
+    if not (set(secret) - set(string.hexdigits + '-')):
+        return False
+
+    if not get_model().data or not get_model().charset:
+        raise AssertionError('Attempting to use uninitialized gibberish model.')
+
+    from gibberish_detector.detector import Detector
+    detector = Detector(
+        model=get_model(),
+        threshold=get_settings().filters[f'{__name__}.should_exclude_secret']['limit'],
+    )
+
+    # TODO: secret.lower() is only used currently, since the default model is only
+    # trained with lower case letters. However, in the future, if people want to train
+    # a model that is case-sensitive, we can figure out how to change this.
+    # Unfortunately, it's not straight-forward to just remove the `.lower()` function call,
+    # since if the string is *not* lowered (and the model expects it to be), the results
+    # will be quite different.
+    return not detector.is_gibberish(secret.lower())
+
+
+@lru_cache(maxsize=1)
+def get_model() -> 'Model':
+    from gibberish_detector.model import Model
+    return Model(charset='')
diff --git a/detect_secrets/filters/gibberish/rfc.model b/detect_secrets/filters/gibberish/rfc.model
new file mode 100644
index 000000000..0676ed2a1
--- /dev/null
+++ b/detect_secrets/filters/gibberish/rfc.model
@@ -0,0 +1 @@
+{"charset": "abcdefghijklmnopqrstuvwxyz", "ngram_size": 2, "counts": {"a": {"a": 118635, "b": 630875, "c": 1501150, "d": 938413, "e": 64942, "f": 203534, "g": 892717, "h": 50302, "i": 629589, "j": 16393, "k": 122527, "l": 2134199, "m": 865412, "n": 3466183, "o": 39385, "p": 708687, "q": 15378, "r": 1990681, "s": 1639856, "t": 4062626, "u": 390482, "v": 290066, "w": 65833, "x": 148257, "y": 497074, "z": 9679}, "b": {"a": 299747, "b": 38842, "c": 66484, "d": 26575, "e": 1340130, "f": 29026, "g": 32212, "h": 8693, "i": 355357, "j": 188222, "k": 6350, "l": 528022, "m": 42838, "n": 36585, "o": 271797, "p": 20216, "q": 1650, "r": 143544, "s": 134924, "t": 55332, "u": 282480, "v": 8513, "w": 9195, "x": 4962, "y": 350774, "z": 1991}, "c": {"a": 1583493, "b": 55072, "c": 366166, "d": 99397, "e": 1680049, "f": 61790, "g": 22751, "h": 1138161, "i": 647914, "j": 23415, "k": 608337, "l": 409128, "m": 129894, "n": 63692, "o": 2266996, "p": 202574, "q": 8192, "r": 525914, "s": 234381, "t": 1400393, "u": 582740, "v": 25213, "w": 27222, "x": 12164, "y": 94929, "z": 3613}, "d": {"a": 1102196, "b": 395535, "c": 205472, "d": 521902, "e": 2349496, "f": 230444, "g": 90095, "h": 107584, "i": 1575337, "j": 30840, "k": 23781, "l": 151769, "m": 188851, "n": 183549, "o": 773165, "p": 252828, "q": 10114, "r": 490139, "s": 679426, "t": 677199, "u": 321170, "v": 101547, "w": 168753, "x": 13779, "y": 59907, "z": 4936}, "e": {"a": 1743853, "b": 293141, "c": 2504452, "d": 3070491, "e": 835273, "f": 950803, "g": 431308, "h": 176469, "i": 1177449, "j": 38230, "k": 70232, "l": 1227191, "m": 1322498, "n": 4007948, "o": 705920, "p": 939044, "q": 450185, "r": 5234236, "s": 4637943, "t": 2734401, "u": 284887, "v": 537744, "w": 395249, "x": 721758, "y": 247129, "z": 15923}, "f": {"a": 482902, "b": 46301, "c": 776635, "d": 74453, "e": 459377, "f": 324747, "g": 15818, "h": 24876, "i": 1326013, "j": 5135, "k": 7786, "l": 154253, "m": 68036, "n": 47090, "o": 1525315, "p": 71256, "q": 7352, "r": 399118, "s": 
158690, "t": 743932, "u": 192428, "v": 25544, "w": 23974, "x": 11395, "y": 80258, "z": 5178}, "g": {"a": 374124, "b": 43000, "c": 75143, "d": 54891, "e": 1210541, "f": 61951, "g": 89766, "h": 265348, "i": 434656, "j": 9650, "k": 8369, "l": 111072, "m": 141157, "n": 262172, "o": 233776, "p": 126045, "q": 3985, "r": 368752, "s": 199803, "t": 394592, "u": 242465, "v": 16518, "w": 40581, "x": 7464, "y": 46646, "z": 4033}, "h": {"a": 1557099, "b": 48781, "c": 89817, "d": 47268, "e": 4944439, "f": 32193, "g": 12079, "h": 20750, "i": 1105135, "j": 3953, "k": 9651, "l": 32170, "m": 125637, "n": 76152, "o": 775525, "p": 47687, "q": 3047, "r": 145761, "s": 83710, "t": 413869, "u": 73213, "v": 16513, "w": 22152, "x": 4223, "y": 37868, "z": 6960}, "i": {"a": 545417, "b": 441765, "c": 1649054, "d": 823145, "e": 1026129, "f": 883435, "g": 624619, "h": 16297, "i": 47452, "j": 11907, "k": 56496, "l": 788318, "m": 672818, "n": 5188085, "o": 2696322, "p": 641038, "q": 31721, "r": 507827, "s": 2474217, "t": 2145858, "u": 27798, "v": 521907, "w": 8937, "x": 103556, "y": 3155, "z": 178043}, "j": {"a": 65309, "b": 5150, "c": 4630, "d": 3609, "e": 207116, "f": 2967, "g": 2140, "h": 3674, "i": 12565, "j": 1966, "k": 4624, "l": 3278, "m": 6958, "n": 5271, "o": 59640, "p": 7497, "q": 1402, "r": 4799, "s": 15013, "t": 4209, "u": 80334, "v": 2082, "w": 6849, "x": 1481, "y": 1382, "z": 1520}, "k": {"a": 91393, "b": 24180, "c": 35534, "d": 25923, "e": 557437, "f": 32487, "g": 8856, "h": 16581, "i": 173899, "j": 5028, "k": 9027, "l": 40471, "m": 36600, "n": 89335, "o": 53389, "p": 133808, "q": 2485, "r": 28645, "s": 177604, "t": 62173, "u": 27442, "v": 5748, "w": 27477, "x": 2830, "y": 6013, "z": 1805}, "l": {"a": 929370, "b": 103217, "c": 139662, "d": 512423, "e": 1824337, "f": 113308, "g": 78226, "h": 36656, "i": 1450931, "j": 14335, "k": 24164, "l": 983903, "m": 107923, "n": 110976, "o": 905778, "p": 211582, "q": 5416, "r": 114840, "s": 608836, "t": 453915, "u": 460820, "v": 96718, "w": 
49654, "x": 6631, "y": 577831, "z": 11631}, "m": {"a": 1635769, "b": 287867, "c": 67799, "d": 50279, "e": 2204807, "f": 38281, "g": 16738, "h": 16864, "i": 672936, "j": 6422, "k": 6652, "l": 90860, "m": 268828, "n": 40050, "o": 527359, "p": 791104, "q": 3281, "r": 47377, "s": 279178, "t": 188486, "u": 454062, "v": 16237, "w": 25264, "x": 6360, "y": 9965, "z": 2639}, "n": {"a": 1538477, "b": 200065, "c": 997362, "d": 2330762, "e": 1952162, "f": 625660, "g": 1940478, "h": 83259, "i": 1015263, "j": 28397, "k": 150610, "l": 214383, "m": 226678, "n": 356573, "o": 1367298, "p": 210677, "q": 9545, "r": 232943, "s": 1723024, "t": 4051859, "u": 306661, "v": 137961, "w": 115060, "x": 18770, "y": 191590, "z": 12659}, "o": {"a": 333611, "b": 507058, "c": 1030631, "d": 621611, "e": 154901, "f": 1509661, "g": 182750, "h": 53343, "i": 324137, "j": 11882, "k": 87654, "l": 804704, "m": 994391, "n": 4575413, "o": 219213, "p": 683677, "q": 4149, "r": 3464564, "s": 641767, "t": 1406855, "u": 1149771, "v": 410511, "w": 561073, "x": 56638, "y": 39476, "z": 8869}, "p": {"a": 1189585, "b": 42076, "c": 124961, "d": 126398, "e": 1242826, "f": 80409, "g": 17643, "h": 137438, "i": 266019, "j": 9649, "k": 35379, "l": 828131, "m": 106979, "n": 68507, "o": 908296, "p": 557551, "q": 4360, "r": 1408275, "s": 295854, "t": 588109, "u": 216552, "v": 162392, "w": 69550, "x": 9971, "y": 42770, "z": 2061}, "q": {"a": 5362, "b": 2396, "c": 3695, "d": 4903, "e": 2437, "f": 2492, "g": 1606, "h": 1937, "i": 6228, "j": 1178, "k": 1502, "l": 2222, "m": 5600, "n": 6138, "o": 16223, "p": 3612, "q": 2178, "r": 4314, "s": 7449, "t": 4457, "u": 523091, "v": 2218, "w": 1957, "x": 1602, "y": 1593, "z": 1147}, "r": {"a": 1884150, "b": 108467, "c": 469520, "d": 619333, "e": 4613547, "f": 928117, "g": 206040, "h": 70002, "i": 1924644, "j": 14216, "k": 336855, "l": 161805, "m": 782008, "n": 396973, "o": 2026650, "p": 279715, "q": 9201, "r": 543892, "s": 939303, "t": 1323019, "u": 276419, "v": 443167, "w": 154662, "x": 
13149, "y": 462288, "z": 6346}, "s": {"a": 1571556, "b": 200500, "c": 850779, "d": 386238, "e": 3128653, "f": 376110, "g": 84160, "h": 508213, "i": 1940017, "j": 27903, "k": 82209, "l": 194818, "m": 459368, "n": 353397, "o": 1116229, "p": 919294, "q": 11941, "r": 342864, "s": 1854672, "t": 3581542, "u": 824325, "v": 78034, "w": 307467, "x": 24837, "y": 215803, "z": 8336}, "t": {"a": 2239522, "b": 240284, "c": 445118, "d": 188710, "e": 3539356, "f": 299799, "g": 48582, "h": 6502699, "i": 4442428, "j": 13452, "k": 25898, "l": 277765, "m": 287203, "n": 183293, "o": 2396902, "p": 414605, "q": 8754, "r": 1552404, "s": 1302370, "t": 1061425, "u": 546833, "v": 47746, "w": 473638, "x": 30926, "y": 739526, "z": 16025}, "u": {"a": 254536, "b": 217707, "c": 270253, "d": 169700, "e": 638912, "f": 36218, "g": 117660, "h": 6047, "i": 258834, "j": 3202, "k": 8410, "l": 739209, "m": 462935, "n": 660975, "o": 17845, "p": 395636, "q": 1456, "r": 975780, "s": 1272647, "t": 937723, "u": 9229, "v": 4979, "w": 4665, "x": 7779, "y": 2786, "z": 3638}, "v": {"a": 553164, "b": 9641, "c": 37594, "d": 16968, "e": 1438129, "f": 10523, "g": 4867, "h": 8538, "i": 626442, "j": 5603, "k": 2505, "l": 16039, "m": 21836, "n": 13803, "o": 81911, "p": 80950, "q": 2070, "r": 27617, "s": 33989, "t": 22994, "u": 9415, "v": 8120, "w": 6624, "x": 3053, "y": 3050, "z": 1843}, "w": {"a": 328181, "b": 12472, "c": 24164, "d": 16960, "e": 296182, "f": 14279, "g": 12677, "h": 420451, "i": 733532, "j": 2434, "k": 6230, "l": 49180, "m": 17167, "n": 100428, "o": 450177, "p": 19437, "q": 1897, "r": 77674, "s": 115640, "t": 62041, "u": 7219, "v": 6072, "w": 98652, "x": 2572, "y": 4494, "z": 1484}, "x": {"a": 195222, "b": 15941, "c": 99442, "d": 23958, "e": 57299, "f": 23622, "g": 4359, "h": 9176, "i": 121876, "j": 3221, "k": 2420, "l": 12061, "m": 60813, "n": 10788, "o": 24409, "p": 147454, "q": 1549, "r": 18376, "s": 56593, "t": 281697, "u": 8440, "v": 6979, "w": 6973, "x": 34004, "y": 28368, "z": 1965}, "y": {"a": 
299426, "b": 136713, "c": 149196, "d": 90086, "e": 172967, "f": 77444, "g": 22164, "h": 43433, "i": 236429, "j": 8732, "k": 13359, "l": 81444, "m": 131819, "n": 169093, "o": 195258, "p": 444042, "q": 5482, "r": 140869, "s": 328005, "t": 375785, "u": 48208, "v": 23601, "w": 81979, "x": 4858, "y": 8417, "z": 7340}, "z": {"a": 76890, "b": 2486, "c": 3088, "d": 2886, "e": 144164, "f": 1805, "g": 1705, "h": 9255, "i": 15379, "j": 1740, "k": 1506, "l": 3047, "m": 2930, "n": 2134, "o": 21144, "p": 2423, "q": 1128, "r": 7939, "s": 5601, "t": 3439, "u": 2997, "v": 2471, "w": 2121, "x": 1812, "y": 2670, "z": 4454}}}
diff --git a/detect_secrets/filters/util.py b/detect_secrets/filters/util.py
index 9acdb84e8..54455c81e 100644
--- a/detect_secrets/filters/util.py
+++ b/detect_secrets/filters/util.py
@@ -1,3 +1,4 @@
+import hashlib
 import inspect
 
 
@@ -23,3 +24,27 @@ def get_caller_path(offset: int = 0) -> str:
     module_path = frame_info.frame.f_globals['__name__']
     function_name = frame_info.function
     return f'{module_path}.{function_name}'
+
+
+def compute_file_hash(filename: str, buffer_size: int = 64 * 1024) -> str:
+    """
+    When we make any modifications to the inputs of a baseline, the baseline should
+    also reflect these changes. Otherwise, we can get into a strange situation of
+    irreproducibility: a "hidden" change to the underlying model may produce different
+    results.
+
+    To ensure this doesn't happen, we capture a hash of the model that we are using.
+    This way, if the model changes, the baseline will have to change, and we will be
+    able to better track the changes this way.
+
+    This is akin to:
+        $ sha1sum <filename>
+    """
+    sha1 = hashlib.sha1()
+    with open(filename, 'rb') as f:
+        data = f.read(buffer_size)
+        while data:
+            sha1.update(data)
+            data = f.read(buffer_size)
+
+    return sha1.hexdigest()
diff --git a/detect_secrets/filters/wordlist.py b/detect_secrets/filters/wordlist.py
index 4c1968fd2..690fa0325 100644
--- a/detect_secrets/filters/wordlist.py
+++ b/detect_secrets/filters/wordlist.py
@@ -4,11 +4,11 @@
 will result in false positives. This filter efficiently processes this through the
 use of the Aho-Corasick algorithm.
 """
-import hashlib
 from functools import lru_cache
 from typing import Any
 
 from ..settings import get_settings
+from .util import compute_file_hash
 
 
 Automaton = Any
@@ -50,7 +50,7 @@ def initialize(wordlist_filename: str, min_length: int = 3, file_hash: str = '')
     get_settings().filters[path] = {
         'min_length': min_length,
         'file_name': wordlist_filename,
-        'file_hash': _compute_wordlist_hash(wordlist_filename),
+        'file_hash': compute_file_hash(wordlist_filename),
     }
 
     automaton.make_automaton()
@@ -70,21 +70,3 @@ def should_exclude_secret(secret: str) -> bool:
 def get_automaton() -> Automaton:
     import ahocorasick
     return ahocorasick.Automaton()
-
-
-def _compute_wordlist_hash(filename: str, buffer_size: int = 64 * 1024) -> str:
-    """
-    We compute the hash based on the file contents, rather than the filename itself, since we
-    want to know if the underlying contents of the file changes.
-
-    This is akin to:
-        $ sha1sum <filename>
-    """
-    sha1 = hashlib.sha1()
-    with open(filename, 'rb') as f:
-        data = f.read(buffer_size)
-        while data:
-            sha1.update(data)
-            data = f.read(buffer_size)
-
-    return sha1.hexdigest()
diff --git a/detect_secrets/plugins/keyword.py b/detect_secrets/plugins/keyword.py
index dbf0e0e72..f67c84b0d 100644
--- a/detect_secrets/plugins/keyword.py
+++ b/detect_secrets/plugins/keyword.py
@@ -50,101 +50,7 @@
     'secret',
     'secrete',
 )
-'''
-Deprecated false positives list. This will be migrated soon.
-FALSE_POSITIVES = {
-    '""',
-    '""):',
-    '"\'',
-    '")',
-    '"dummy',
-    '"replace',
-    '"this',
-    '#pass',
-    '#password',
-    '$(shell',
-    "'\"",
-    "''",
-    "''):",
-    "')",
-    "'dummy",
-    "'replace",
-    "'this",
-    '(nsstring',
-    '-default}',
-    '::',
-    '<%=',
-    '<?php',
-    '<a',
-    '<aws_secret_access_key>',
-    '<input',
-    '<password>',
-    '<redacted>',
-    '<secret',
-    '>',
-    '=',
-    '\\"$(shell',
-    '\\k.*"',
-    "\\k.*'",
-    '`cat',
-    '`grep',
-    '`sudo',
-    'account_password',
-    'api_key',
-    'disable',
-    'dummy_secret',
-    'dummy_value',
-    'false',
-    'false):',
-    'false,',
-    'false;',
-    'login_password',
-    'none',
-    'none,',
-    'none}',
-    'nopasswd',
-    'not',
-    'not_real_key',
-    'null',
-    'null,',
-    'null.*"',
-    "null.*'",
-    'null;',
-    'pass',
-    'pass)',
-    'password',
-    'password)',
-    'password))',
-    'password,',
-    'password},',
-    'prompt',
-    'redacted',
-    'secret',
-    'some_key',
-    'str',
-    'str_to_sign',
-    'string',
-    'string)',
-    'string,',
-    'string;',
-    'string?',
-    'string?)',
-    'string}',
-    'string}}',
-    'test',
-    'test-access-key',
-    'thisisnottherealsecret',
-    'todo',
-    'true',
-    'true):',
-    'true,',
-    'true;',
-    'undef',
-    'undef,',
-    '{',
-    '{{',
-}
-'''
+
 # Includes ], ', " as closing
 CLOSING = r'[]\'"]{0,2}'
 DENYLIST_REGEX = r'|'.join(DENYLIST)
diff --git a/detect_secrets/settings.py b/detect_secrets/settings.py
index 17ed6ef2d..5a4664780 100644
--- a/detect_secrets/settings.py
+++ b/detect_secrets/settings.py
@@ -43,6 +43,15 @@ def configure_settings_from_baseline(baseline: Dict[str, Any], filename: str = '
                 file_hash=config['file_hash'],
             )
 
+        if 'detect_secrets.filters.gibberish.should_exclude_secret' in settings.filters:
+            config = settings.filters['detect_secrets.filters.gibberish.should_exclude_secret']
+
+            from detect_secrets import filters
+            filters.gibberish.initialize(
+                model_path=config.get('model'),
+                limit=config['limit'],
+            )
+
     if filename:
         settings.filters['detect_secrets.filters.common.is_baseline_file'] = {
             'filename': filename,
@@ -129,6 +138,7 @@ def configure_plugins(self, config: List[Dict[str, Any]]) -> 'Settings':
             name = plugin.pop('name')
             self.plugins[name] = plugin
 
+        get_plugins.cache_clear()
         return self
 
     def disable_plugins(self, *plugin_names: str) -> 'Settings':
@@ -138,6 +148,7 @@ def disable_plugins(self, *plugin_names: str) -> 'Settings':
             except KeyError:
                 pass
 
+        get_plugins.cache_clear()
         return self
 
     def configure_filters(self, config: List[Dict[str, Any]]) -> 'Settings':
@@ -162,12 +173,14 @@ def configure_filters(self, config: List[Dict[str, Any]]) -> 'Settings':
             path = filter_config['path']
             self.filters[path] = filter_config
 
+        get_filters.cache_clear()
         return self
 
     def disable_filters(self, *filter_paths: str) -> 'Settings':
         for filter_path in filter_paths:
             self.filters.pop(filter_path, None)
 
+        get_filters.cache_clear()
         return self
 
     def json(self) -> Dict[str, Any]:
diff --git a/docs/filters.md b/docs/filters.md
index f0e721457..0160c1aa4 100644
--- a/docs/filters.md
+++ b/docs/filters.md
@@ -46,6 +46,7 @@ the `detect_secrets.filters` namespace.
 | `common.is_invalid_file`                         | Ignores files that are not files (e.g. links).                                      |
 | `common.is_baseline_file`                        | Ignores the baseline file itself.                                                   |
 | `common.is_ignored_due_to_verification_policies` | Powers secret verification functionality.                                           |
+| `gibberish.should_exclude_secret`                | Excludes secrets that are not gibberish-looking strings.                            |
 | `heuristic.is_indirect_reference`                | Primarily for `KeywordDetector`, filters secrets like `secret = get_secret_key()`.  |
 | `heuristic.is_likely_id_string`                  | Ignores secret values prefixed with `id`.                                           |
 | `heuristic.is_non_text_file`                     | Ignores non-text files (e.g. archives, images).                                     |
diff --git a/requirements-dev-minimal.txt b/requirements-dev-minimal.txt
index 66f45e063..4d2fb9892 100644
--- a/requirements-dev-minimal.txt
+++ b/requirements-dev-minimal.txt
@@ -2,6 +2,7 @@
 # on python 3.6.0 (xenial)
 coverage<5
 flake8==3.5.0
+gibberish-detector>=0.1.1
 monotonic
 mypy
 pre-commit
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 20b8fd98d..5853095ee 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -7,6 +7,7 @@ coverage==4.5.4
 distlib==0.3.1
 filelock==3.0.12
 flake8==3.5.0
+gibberish-detector==0.1.1
 identify==1.5.10
 idna==2.10
 importlib-metadata==2.1.1
diff --git a/setup.py b/setup.py
index e19b75fe9..49f16f820 100644
--- a/setup.py
+++ b/setup.py
@@ -34,10 +34,14 @@ def get_version():
         'pyyaml',
         'requests',
     ],
+    include_package_data=True,
     extras_require={
         'word_list': [
             'pyahocorasick',
         ],
+        'gibberish': [
+            'gibberish-detector',
+        ],
     },
     entry_points={
         'console_scripts': [
diff --git a/testing/mocks.py b/testing/mocks.py
index f1ec173fc..120c3b87d 100644
--- a/testing/mocks.py
+++ b/testing/mocks.py
@@ -71,3 +71,19 @@ def debug(self, message: str, *args: Any) -> None:
     @property
     def debug_messages(self) -> str:        # pragma: no cover
         return self.messages['debug']
+
+
+@contextmanager
+def disable_gibberish_filter():
+    """
+    Unfortunately, we can't just use `Settings.disable_filters`, since `parse_args` is
+    the function that *enables* this filter. Therefore, for test cases that test through
+    the `main` function flow, we can't disable the filter before the function call.
+
+    However, since this only happens in test environments, we can just mock it out.
+    """
+    with mock.patch(
+        'detect_secrets.filters.gibberish.is_feature_enabled',
+        return_value=False,
+    ):
+        yield
diff --git a/tests/filters/common_filter_test.py b/tests/filters/common_filter_test.py
index 81cd8e9a4..be6bc6032 100644
--- a/tests/filters/common_filter_test.py
+++ b/tests/filters/common_filter_test.py
@@ -14,7 +14,7 @@ class TestVerify:
     @staticmethod
     def test_does_not_verify_if_no_verify():
         with register_plugin(MockPlugin(should_verify=False)):
-            main_module.main(['scan', '--string', 'fake-secret', '--no-verify'])
+            main_module.main(['scan', '--string', 'deadbeef', '--no-verify'])
 
     @staticmethod
     @pytest.mark.parametrize(
@@ -30,7 +30,7 @@ def test_adheres_to_verification_policies(args, verified_result, should_be_prese
         with register_plugin(
             MockPlugin(verified_result=verified_result),
         ), mock_printer(main_module) as printer:
-            main_module.main(['scan', '--string', 'fake-secret', *args])
+            main_module.main(['scan', '--string', 'deadbeef', *args])
 
         for line in printer.message.splitlines():
             plugin_name, result = [x.strip() for x in line.split(':')]
@@ -54,7 +54,8 @@ def test_handles_request_error_gracefully():
 
 class MockPlugin(RegexBasedDetector):
     denylist = (
-        re.compile('fake-secret'),
+        # We use a hex string here, due to the gibberish detector.
+        re.compile('deadbeef'),
     )
     secret_type = 'mock plugin'
 
diff --git a/tests/filters/gibberish_filter_test.py b/tests/filters/gibberish_filter_test.py
new file mode 100644
index 000000000..a526f8ad1
--- /dev/null
+++ b/tests/filters/gibberish_filter_test.py
@@ -0,0 +1,61 @@
+import os
+
+import pytest
+
+from detect_secrets import filters
+from detect_secrets.plugins.private_key import PrivateKeyDetector
+from detect_secrets.settings import transient_settings
+
+
+class TestShouldExcludeSecret:
+    @staticmethod
+    @pytest.fixture(autouse=True)
+    def initialize():
+        filters.gibberish.initialize(
+            model_path=os.path.join(filters.gibberish.__path__[0], 'rfc.model'),
+        )
+
+        try:
+            yield
+        finally:
+            filters.gibberish.get_model.cache_clear()
+
+    @staticmethod
+    @pytest.mark.parametrize(
+        'secret',
+        (
+            'this-is-a-bad-password',
+
+            # URLs (which have been traditionally picked up by Base64HighEntropyString)
+            # are now excluded!
+            '/biz_user/NCIygBmcWTENrE1n06oprA/business_ids/v1',
+
+            # same thing with long strings
+            'k8s-KUBE_CLUSTER-ca/issue/k8s-prometheus-adapter',
+        ),
+    )
+    def test_success(secret):
+        assert filters.gibberish.should_exclude_secret(secret)
+
+    @staticmethod
+    def test_ignores_hex_strings():
+        assert not filters.gibberish.should_exclude_secret('2b00042f7481c7b056c4b410d28f33cf')
+
+    @staticmethod
+    def test_does_not_affect_private_keys():
+        assert not filters.gibberish.should_exclude_secret(
+            'BEGIN PRIVATE KEY',
+            plugin=PrivateKeyDetector(),
+        )
+
+
+def test_load_from_baseline():
+    with transient_settings({
+        'filters_used': [{
+            'path': 'detect_secrets.filters.gibberish.should_exclude_secret',
+            'model': os.path.join(filters.gibberish.__path__[0], 'rfc.model'),
+            'file_hash': '00b672f709e9bf51fe2e09abe247ac3b6415d645',
+            'limit': 3.7,
+        }],
+    }):
+        assert filters.gibberish.should_exclude_secret('clearly-not-a-secret')
diff --git a/tests/main_test.py b/tests/main_test.py
index 928cc1aab..187b401a2 100644
--- a/tests/main_test.py
+++ b/tests/main_test.py
@@ -9,6 +9,7 @@
 from detect_secrets.core.secrets_collection import SecretsCollection
 from detect_secrets.main import scan_adhoc_string
 from detect_secrets.settings import transient_settings
+from testing.mocks import disable_gibberish_filter
 from testing.mocks import mock_printer
 
 
@@ -130,7 +131,7 @@ def test_supports_stdin():
             ],
         }), mock_stdin(
             'AKIATESTTESTTESTTEST',
-        ), mock_printer(main_module) as printer:
+        ), mock_printer(main_module) as printer, disable_gibberish_filter():
             assert main_module.main(['scan', '--string']) == 0
 
             assert printer.message.strip() == 'AWSKeyDetector: True  (unverified)'
diff --git a/tests/pre_commit_hook_test.py b/tests/pre_commit_hook_test.py
index 05070995a..4c44fd7fe 100644
--- a/tests/pre_commit_hook_test.py
+++ b/tests/pre_commit_hook_test.py
@@ -11,6 +11,7 @@
 from detect_secrets.core.secrets_collection import SecretsCollection
 from detect_secrets.pre_commit_hook import main
 from detect_secrets.settings import transient_settings
+from testing.mocks import disable_gibberish_filter
 
 
 @pytest.fixture(autouse=True)
@@ -50,31 +51,33 @@ def test_baseline_filters_out_known_secrets():
     secrets = SecretsCollection()
     secrets.scan_file('test_data/each_secret.py')
 
-    with tempfile.NamedTemporaryFile() as f:
-        baseline.save_to_file(secrets, f.name)
-        f.seek(0)
+    assert secrets
 
-        # This succeeds, because all the secrets are known.
-        assert_commit_succeeds([
-            'test_data/each_secret.py',
-            '--baseline',
-            f.name,
-        ])
+    with disable_gibberish_filter():
+        with tempfile.NamedTemporaryFile() as f:
+            baseline.save_to_file(secrets, f.name)
+            f.seek(0)
 
-    # Remove one arbitrary secret, so that it won't be the full set.
-    secrets.data['test_data/each_secret.py'].pop()
+            # This succeeds, because all the secrets are known.
+            assert_commit_succeeds([
+                'test_data/each_secret.py',
+                '--baseline',
+                f.name,
+            ])
 
-    with tempfile.NamedTemporaryFile() as f:
-        baseline.save_to_file(secrets, f.name)
-        f.seek(0)
+        # Remove one arbitrary secret, so that it won't be the full set.
+        secrets.data['test_data/each_secret.py'].pop()
 
-        # Test that it isn't the case that a baseline is provided, and everything passes.
-        # import pdb; pdb.set_trace()
-        assert_commit_blocked([
-            'test_data/each_secret.py',
-            '--baseline',
-            f.name,
-        ])
+        with tempfile.NamedTemporaryFile() as f:
+            baseline.save_to_file(secrets, f.name)
+            f.seek(0)
+
+            # Test that it isn't the case that a baseline is provided, and everything passes.
+            assert_commit_blocked([
+                'test_data/each_secret.py',
+                '--baseline',
+                f.name,
+            ])
 
 
 class TestModifiesBaselineFromVersionChange: