🔭[Keyword Plugin] Precision improvements

Made quotes required in Python files/added regexes for this Added a Filetype Enum and `determine_file_type` function Replaced 'pass' with 'db_pass' in BLACKLIST Added 'aws_secret_access_key' to BLACKLIST Added some trailing char cases to FALSE_POSITIVES :boom: Changed secret_type to 'Secret Keyword'
Yelp · Dec 28, 2018 · e01d818 · e01d818
1 parent f15366e
commit e01d818
Show file tree

Hide file tree

Showing 6 changed files with 190 additions and 30 deletions.
diff --git a/detect_secrets/core/audit.py b/detect_secrets/core/audit.py
@@ -9,6 +9,7 @@
 
 from ..plugins.core import initialize
 from ..plugins.high_entropy_strings import HighEntropyStringsPlugin
+from ..plugins.keyword import determine_file_type
 from ..plugins.keyword import KeywordDetector
 from .baseline import format_baseline_for_output
 from .baseline import merge_results
@@ -554,7 +555,7 @@ def _highlight_secret(
     for raw_secret in _raw_secret_generator(
         plugin,
         secret_line,
-        is_php_file=filename.endswith('.php'),
+        filetype=determine_file_type(filename),
     ):
         secret_obj = PotentialSecret(
             plugin.secret_type,
@@ -583,10 +584,10 @@ def _highlight_secret(
     )
 
 
-def _raw_secret_generator(plugin, secret_line, is_php_file):
+def _raw_secret_generator(plugin, secret_line, filetype):
     """Generates raw secrets by re-scanning the line, with the specified plugin"""
     if isinstance(plugin, KeywordDetector):
-        for raw_secret in plugin.secret_generator(secret_line, is_php_file):
+        for raw_secret in plugin.secret_generator(secret_line, filetype=filetype):
             yield raw_secret
     else:
         for raw_secret in plugin.secret_generator(secret_line):

diff --git a/detect_secrets/plugins/keyword.py b/detect_secrets/plugins/keyword.py
@@ -27,6 +27,7 @@
 from __future__ import absolute_import
 
 import re
+from enum import Enum
 
 from .base import BasePlugin
 from detect_secrets.core.potential_secret import PotentialSecret
@@ -36,7 +37,8 @@
 BLACKLIST = (
     'apikey',
     'api_key',
-    'pass',
+    'aws_secret_access_key',
+    'db_pass',
     'password',
     'passwd',
     'private_key',
@@ -49,24 +51,40 @@
     '""',
     '""):',
     'false',
+    'false):',
     'none',
+    'none,',
+    'none}',
     'not',
     'password)',
     'password},',
     'true',
+    'true):',
 )
 FOLLOWED_BY_COLON_RE = re.compile(
-    # e.g. token:
+    # e.g. api_key: foo
     r'({})(("|\')?):(\s*?)(("|\')?)([^\s]+)(\5)'.format(
         r'|'.join(BLACKLIST),
     ),
 )
+FOLLOWED_BY_COLON_QUOTES_REQUIRED_RE = re.compile(
+    # e.g. api_key: "foo"
+    r'({})(("|\')?):(\s*?)(("|\'))([^\s]+)(\5)'.format(
+        r'|'.join(BLACKLIST),
+    ),
+)
 FOLLOWED_BY_EQUAL_SIGNS_RE = re.compile(
-    # e.g. my_password =
+    # e.g. my_password = bar
     r'({})()(\s*?)=(\s*?)(("|\')?)([^\s]+)(\5)'.format(
         r'|'.join(BLACKLIST),
     ),
 )
+FOLLOWED_BY_EQUAL_SIGNS_QUOTES_REQUIRED_RE = re.compile(
+    # e.g. my_password = "bar"
+    r'({})()(\s*?)=(\s*?)(("|\'))([^\s]+)(\5)'.format(
+        r'|'.join(BLACKLIST),
+    ),
+)
 FOLLOWED_BY_QUOTES_AND_SEMICOLON_RE = re.compile(
     # e.g. private_key "something";
     r'({})([^\s]*?)(\s*?)("|\')([^\s]+)(\4);'.format(
@@ -78,21 +96,45 @@
     FOLLOWED_BY_EQUAL_SIGNS_RE: 7,
     FOLLOWED_BY_QUOTES_AND_SEMICOLON_RE: 5,
 }
+PYTHON_BLACKLIST_REGEX_TO_GROUP = {
+    FOLLOWED_BY_COLON_QUOTES_REQUIRED_RE: 7,
+    FOLLOWED_BY_EQUAL_SIGNS_QUOTES_REQUIRED_RE: 7,
+    FOLLOWED_BY_QUOTES_AND_SEMICOLON_RE: 5,
+}
+
+
+class FileType(Enum):
+    PYTHON = 1
+    PHP = 2
+    OTHER = 3
+
+
+def determine_file_type(filename):
+    """
+    :param filename: str
+
+    :rtype: FileType
+    """
+    if filename.endswith('.py'):
+        return FileType.PYTHON
+    elif filename.endswith('.php'):
+        return FileType.PHP
+    return FileType.OTHER
 
 
 class KeywordDetector(BasePlugin):
     """This checks if blacklisted keywords
     are present in the analyzed string.
     """
 
-    secret_type = 'Password'
+    secret_type = 'Secret Keyword'
 
     def analyze_string(self, string, line_num, filename):
         output = {}
 
         for identifier in self.secret_generator(
             string,
-            is_php_file=filename.endswith('.php'),
+            filetype=determine_file_type(filename),
         ):
             secret = PotentialSecret(
                 self.secret_type,
@@ -104,26 +146,35 @@ def analyze_string(self, string, line_num, filename):
 
         return output
 
-    def secret_generator(self, string, is_php_file):
+    def secret_generator(self, string, filetype):
         lowered_string = string.lower()
 
-        for REGEX, group_number in BLACKLIST_REGEX_TO_GROUP.items():
+        if filetype == FileType.PYTHON:
+            blacklist_RE_to_group = PYTHON_BLACKLIST_REGEX_TO_GROUP
+        else:
+            blacklist_RE_to_group = BLACKLIST_REGEX_TO_GROUP
+
+        for REGEX, group_number in blacklist_RE_to_group.items():
             match = REGEX.search(lowered_string)
             if match:
                 lowered_secret = match.group(group_number)
 
                 # ([^\s]+) guarantees lowered_secret is not ''
-                if not probably_false_positive(lowered_secret, is_php_file):
+                if not probably_false_positive(
+                    lowered_secret,
+                    filetype=filetype,
+                ):
                     yield lowered_secret
 
 
-def probably_false_positive(lowered_secret, is_php_file):
+def probably_false_positive(lowered_secret, filetype):
     if (
         'fake' in lowered_secret or
+        'self.' in lowered_secret or
         lowered_secret in FALSE_POSITIVES or
         # If it is a .php file, do not report $variables
         (
-            is_php_file and
+            filetype == FileType.PHP and
             lowered_secret[0] == '$'
         )
     ):

diff --git a/testing/factories.py b/testing/factories.py
@@ -6,7 +6,7 @@
 
 def potential_secret_factory(type_='type', filename='filename', secret='secret', lineno=1):
     """This is only marginally better than creating PotentialSecret objects directly,
-    because of default values.
+    because of the default values.
     """
     return PotentialSecret(type_, filename, secret, lineno)
 

diff --git a/tests/core/audit_test.py b/tests/core/audit_test.py
@@ -605,7 +605,7 @@ def test_secret_not_found(self, mock_printer):
 
         """)[1:-1]
 
-    def test_secret_in_yaml_file(self, mock_printer):
+    def test_hex_high_entropy_secret_in_yaml_file(self, mock_printer):
         with self._mock_sed_call(
             line_containing_secret='api key: 123456789a',
         ):
@@ -644,6 +644,44 @@ def test_secret_in_yaml_file(self, mock_printer):
 
         """)[1:-1]
 
+    def test_keyword_secret_in_yaml_file(self, mock_printer):
+        with self._mock_sed_call(
+            line_containing_secret='api_key: yerba',
+        ):
+            self.run_logic(
+                secret=potential_secret_factory(
+                    type_='Secret Keyword',
+                    filename='filenameB',
+                    secret='yerba',
+                    lineno=15,
+                ).json(),
+                settings=[
+                    {
+                        'name': 'KeywordDetector',
+                    },
+                ],
+            )
+
+        assert mock_printer.message == textwrap.dedent("""
+            Secret:      1 of 2
+            Filename:    filenameB
+            Secret Type: Secret Keyword
+            ----------
+            10:a
+            11:b
+            12:c
+            13:d
+            14:e
+            15:api_key: yerba
+            16:e
+            17:d
+            18:c
+            19:b
+            20:a
+            ----------
+
+        """)[1:-1]
+
 
 class TestGetUserDecision(object):
 

diff --git a/tests/plugins/keyword_test.py b/tests/plugins/keyword_test.py
@@ -14,20 +14,53 @@ class TestKeywordDetector(object):
         'file_content',
         [
             # FOLLOWED_BY_COLON_RE
+            "'theapikey': 'ho)pe]nob(ody[finds>-_$#thisone'",
+            '"theapikey": "ho)pe]nob(ody[finds>-_$#thisone"',
+            'apikey: ho)pe]nob(ody[finds>-_$#thisone',
+            'apikey:ho)pe]nob(ody[finds>-_$#thisone',
+            'theapikey:ho)pe]nob(ody[finds>-_$#thisone',
+            'apikey: "ho)pe]nob(ody[finds>-_$#thisone"',
+            "apikey:  'ho)pe]nob(ody[finds>-_$#thisone'",
+            # FOLLOWED_BY_EQUAL_SIGNS_RE
+            'my_password=ho)pe]nob(ody[finds>-_$#thisone',
+            'my_password= ho)pe]nob(ody[finds>-_$#thisone',
+            'my_password =ho)pe]nob(ody[finds>-_$#thisone',
+            'my_password = ho)pe]nob(ody[finds>-_$#thisone',
+            'my_password =ho)pe]nob(ody[finds>-_$#thisone',
+            'the_password=ho)pe]nob(ody[finds>-_$#thisone\n',
+            'the_password= "ho)pe]nob(ody[finds>-_$#thisone"\n',
+            'the_password=\'ho)pe]nob(ody[finds>-_$#thisone\'\n',
+            # FOLLOWED_BY_QUOTES_AND_SEMICOLON_RE
+            'apikey "ho)pe]nob(ody[finds>-_$#thisone";',  # Double-quotes
+            'fooapikeyfoo "ho)pe]nob(ody[finds>-_$#thisone";',  # Double-quotes
+            'fooapikeyfoo"ho)pe]nob(ody[finds>-_$#thisone";',  # Double-quotes
+            'private_key \'ho)pe]nob(ody[finds>-_$#thisone\';',  # Single-quotes
+            'fooprivate_keyfoo\'ho)pe]nob(ody[finds>-_$#thisone\';',  # Single-quotes
+            'fooprivate_key\'ho)pe]nob(ody[finds>-_$#thisone\';',  # Single-quotes
+        ],
+    )
+    def test_analyze_positives(self, file_content):
+        logic = KeywordDetector()
+
+        f = mock_file_object(file_content)
+        output = logic.analyze(f, 'mock_filename')
+        assert len(output) == 1
+        for potential_secret in output:
+            assert 'mock_filename' == potential_secret.filename
+            assert (
+                potential_secret.secret_hash
+                == PotentialSecret.hash_secret('ho)pe]nob(ody[finds>-_$#thisone')
+            )
+
+    @pytest.mark.parametrize(
+        'file_content',
+        [
+            # FOLLOWED_BY_COLON_QUOTES_REQUIRED_RE
             "'theapikey': 'hope]nobody[finds>-_$#thisone'",
             '"theapikey": "hope]nobody[finds>-_$#thisone"',
-            'apikey: hope]nobody[finds>-_$#thisone',
-            'apikey:hope]nobody[finds>-_$#thisone',
-            'theapikey:hope]nobody[finds>-_$#thisone',
             'apikey: "hope]nobody[finds>-_$#thisone"',
             "apikey:  'hope]nobody[finds>-_$#thisone'",
-            # FOLLOWED_BY_EQUAL_SIGNS_RE
-            'my_password=hope]nobody[finds>-_$#thisone',
-            'my_password= hope]nobody[finds>-_$#thisone',
-            'my_password =hope]nobody[finds>-_$#thisone',
-            'my_password = hope]nobody[finds>-_$#thisone',
-            'my_password =hope]nobody[finds>-_$#thisone',
-            'the_password=hope]nobody[finds>-_$#thisone\n',
+            # FOLLOWED_BY_EQUAL_SIGNS_QUOTES_REQUIRED_RE
             'the_password= "hope]nobody[finds>-_$#thisone"\n',
             'the_password=\'hope]nobody[finds>-_$#thisone\'\n',
             # FOLLOWED_BY_QUOTES_AND_SEMICOLON_RE
@@ -39,14 +72,14 @@ class TestKeywordDetector(object):
             'fooprivate_key\'hope]nobody[finds>-_$#thisone\';',  # Single-quotes
         ],
     )
-    def test_analyze_positives(self, file_content):
+    def test_analyze_python_positives(self, file_content):
         logic = KeywordDetector()
 
         f = mock_file_object(file_content)
-        output = logic.analyze(f, 'mock_filename')
+        output = logic.analyze(f, 'mock_filename.py')
         assert len(output) == 1
         for potential_secret in output:
-            assert 'mock_filename' == potential_secret.filename
+            assert 'mock_filename.py' == potential_secret.filename
             assert (
                 potential_secret.secret_hash
                 == PotentialSecret.hash_secret('hope]nobody[finds>-_$#thisone')
@@ -66,7 +99,6 @@ def test_analyze_positives(self, file_content):
             'theapikey: "somefakekey"',  # 'fake' in the secret
             'theapikeyforfoo:hopenobodyfindsthisone',  # Characters between apikey and :
             # FOLLOWED_BY_EQUAL_SIGNS_RE
-            '$password = $input;',  # Skip anything starting with $ in php files
             'some_key = "real_secret"',  # We cannot make 'key' a Keyword, too noisy
             'my_password = foo(hey)you',  # Has a ( followed by a )
             "my_password = request.json_body['hey']",  # Has a [ followed by a ]
@@ -76,11 +108,49 @@ def test_analyze_positives(self, file_content):
             'my_password = "fakesecret"',  # 'fake' in the secret
             'login(username=username, password=password)',  # secret is password)
             'open(self, password = ""):',  # secrets is ""):
+            # '',
         ],
     )
     def test_analyze_negatives(self, file_content):
         logic = KeywordDetector()
 
+        f = mock_file_object(file_content)
+        output = logic.analyze(f, 'mock_filename.foo')
+        assert len(output) == 0
+
+    @pytest.mark.parametrize(
+        'file_content',
+        [
+            # FOLLOWED_BY_EQUAL_SIGNS_RE
+            '$password = $input;',  # Skip anything starting with $ in php files
+        ],
+    )
+    def test_analyze_php_negatives(self, file_content):
+        logic = KeywordDetector()
+
         f = mock_file_object(file_content)
         output = logic.analyze(f, 'mock_filename.php')
         assert len(output) == 0
+
+    @pytest.mark.parametrize(
+        'file_content',
+        [
+            # FOLLOWED_BY_COLON_QUOTES_REQUIRED_RE
+            'apikey: hope]nobody[finds>-_$#thisone',  # No quotes
+            'apikey:hope]nobody[finds>-_$#thisone',  # No quotes
+            'theapikey:hope]nobody[finds>-_$#thisone',  # No quotes
+            # FOLLOWED_BY_EQUAL_SIGNS_QUOTES_REQUIRED_RE
+            'my_password=hope]nobody[finds>-_$#thisone',  # No quotes
+            'my_password= hope]nobody[finds>-_$#thisone',  # No quotes
+            'my_password =hope]nobody[finds>-_$#thisone',  # No quotes
+            'my_password = hope]nobody[finds>-_$#thisone',  # No quotes
+            'my_password =hope]nobody[finds>-_$#thisone',  # No quotes
+            'the_password=hope]nobody[finds>-_$#thisone\n',  # No quotes
+        ],
+    )
+    def test_analyze_python_negatives(self, file_content):
+        logic = KeywordDetector()
+
+        f = mock_file_object(file_content)
+        output = logic.analyze(f, 'mock_filename.py')
+        assert len(output) == 0
diff --git a/tox.ini b/tox.ini
@@ -12,7 +12,7 @@ commands =
     coverage erase
     coverage run -m pytest tests
     coverage report --show-missing --include=tests/* --fail-under 100
-    coverage report --show-missing --include=detect_secrets/* --fail-under 96
+    coverage report --show-missing --include=detect_secrets/* --fail-under 97
     pre-commit run --all-files
 
 [testenv:venv]