Skip to content

Commit

Permalink
Merge pull request #86 from Yelp/upgrade_keyword_detector
Browse files Browse the repository at this point in the history
Use different regexes in KeywordDetector to improve accuracy
  • Loading branch information
KevinHock committed Jan 3, 2019
2 parents 1415b4b + a29108b commit 164b7eb
Show file tree
Hide file tree
Showing 17 changed files with 421 additions and 60 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ rejected as a potential secret.
This preset amount can be adjusted in several ways:

* Specifying it within the config file, for server scanning.
* Specifying it with command line flags (eg. `--base64-limit`)
* Specifying it with command line flags (e.g. `--base64-limit`)

Lowering these limits will identify more potential secrets, but also create
more false positives. Adjust these limits to suit your needs.
32 changes: 24 additions & 8 deletions detect_secrets/core/audit.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

from ..plugins.core import initialize
from ..plugins.high_entropy_strings import HighEntropyStringsPlugin
from ..plugins.keyword import determine_file_type
from ..plugins.keyword import KeywordDetector
from .baseline import format_baseline_for_output
from .baseline import merge_results
from .bidirectional_iterator import BidirectionalIterator
Expand Down Expand Up @@ -165,7 +167,7 @@ def compare_baselines(old_baseline_filename, new_baseline_filename):
print('Quitting...')
break

if decision == 'b': # pragma: no cover
if decision == 'b': # pragma: no cover
current_index -= 2
secret_iterator.step_back_on_next_iteration()

Expand Down Expand Up @@ -305,11 +307,11 @@ def _comparison_generator(old_list, new_list, compare_fn):
new_index += 1


def _clear_screen(): # pragma: no cover
def _clear_screen(): # pragma: no cover
subprocess.call(['clear'])


def _print_context( # pragma: no cover
def _print_context( # pragma: no cover
filename,
secret,
count,
Expand Down Expand Up @@ -518,7 +520,13 @@ def _get_secret_with_context(
)


def _highlight_secret(secret_line, secret_lineno, secret, filename, plugin_settings):
def _highlight_secret(
secret_line,
secret_lineno,
secret,
filename,
plugin_settings,
):
"""
:type secret_line: str
:param secret_line: the line on which the secret is found
Expand All @@ -544,7 +552,11 @@ def _highlight_secret(secret_line, secret_lineno, secret, filename, plugin_setti
plugin_settings,
)

for raw_secret in _raw_secret_generator(plugin, secret_line):
for raw_secret in _raw_secret_generator(
plugin,
secret_line,
filetype=determine_file_type(filename),
):
secret_obj = PotentialSecret(
plugin.secret_type,
filename,
Expand Down Expand Up @@ -572,10 +584,14 @@ def _highlight_secret(secret_line, secret_lineno, secret, filename, plugin_setti
)


def _raw_secret_generator(plugin, secret_line):
def _raw_secret_generator(plugin, secret_line, filetype):
"""Generates raw secrets by re-scanning the line, with the specified plugin"""
for raw_secret in plugin.secret_generator(secret_line):
yield raw_secret
if isinstance(plugin, KeywordDetector):
for raw_secret in plugin.secret_generator(secret_line, filetype=filetype):
yield raw_secret
else:
for raw_secret in plugin.secret_generator(secret_line):
yield raw_secret

if issubclass(plugin.__class__, HighEntropyStringsPlugin):
with plugin.non_quoted_string_regex(strict=False):
Expand Down
6 changes: 2 additions & 4 deletions detect_secrets/core/usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,9 +348,7 @@ def _add_opt_out_options(self):
plugin.disable_flag_text,
action='store_true',
help=plugin.disable_help_text,
# Temporarily disabling the KeywordDetector
# Until we can test its effectiveness on more repositories
default=True if plugin.disable_flag_text == '--no-keyword-scan' else False,
default=False,
)

return self
Expand All @@ -370,7 +368,7 @@ def _convert_flag_text_to_argument_name(flag_text):
"""This just emulates argparse's underlying logic.
:type flag_text: str
:param flag_text: eg. `--no-hex-string-scan`
:param flag_text: e.g. `--no-hex-string-scan`
:return: `no_hex_string_scan`
"""
return flag_text[2:].replace('-', '_')
3 changes: 3 additions & 0 deletions detect_secrets/plugins/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ def secret_generator(self, string):
:type string: str
:param string: the secret to scan
:rtype: iter
:returns: Of all the identifiers found
"""
raise NotImplementedError

Expand Down
188 changes: 175 additions & 13 deletions detect_secrets/plugins/keyword.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,34 +26,135 @@
"""
from __future__ import absolute_import

import re
from enum import Enum

from .base import BasePlugin
from detect_secrets.core.potential_secret import PotentialSecret


# Note: All values here should be lowercase
BLACKLIST = (
# NOTE all values here should be lowercase,
# otherwise _secret_generator can fail to match them
'pass =',
'apikey',
'api_key',
'aws_secret_access_key',
'db_pass',
'password',
'passwd',
'pwd',
'private_key',
'secret',
'secrete',
'token',
)
FALSE_POSITIVES = (
"''",
"''):",
"')",
"'this",
'""',
'""):',
'")',
'<a',
'#pass',
'#password',
'<aws_secret_access_key>',
'<password>',
'dummy_secret',
'false',
'false):',
'none',
'none,',
'none}',
'not',
'null,',
'password)',
'password,',
'password},',
'string,',
'string}',
'string}}',
'test-access-key',
'true',
'true):',
'{',
)
FOLLOWED_BY_COLON_RE = re.compile(
# e.g. api_key: foo
r'({})(("|\')?):(\s*?)(("|\')?)([^\s]+)(\5)'.format(
r'|'.join(BLACKLIST),
),
)
FOLLOWED_BY_COLON_QUOTES_REQUIRED_RE = re.compile(
# e.g. api_key: "foo"
r'({})(("|\')?):(\s*?)(("|\'))([^\s]+)(\5)'.format(
r'|'.join(BLACKLIST),
),
)
FOLLOWED_BY_EQUAL_SIGNS_RE = re.compile(
# e.g. my_password = bar
r'({})((\'|")])?()(\s*?)=(\s*?)(("|\')?)([^\s]+)(\7)'.format(
r'|'.join(BLACKLIST),
),
)
FOLLOWED_BY_EQUAL_SIGNS_QUOTES_REQUIRED_RE = re.compile(
# e.g. my_password = "bar"
r'({})((\'|")])?()(\s*?)=(\s*?)(("|\'))([^\s]+)(\7)'.format(
r'|'.join(BLACKLIST),
),
)
FOLLOWED_BY_QUOTES_AND_SEMICOLON_RE = re.compile(
# e.g. private_key "something";
r'({})([^\s]*?)(\s*?)("|\')([^\s]+)(\4);'.format(
r'|'.join(BLACKLIST),
),
)
BLACKLIST_REGEX_TO_GROUP = {
FOLLOWED_BY_COLON_RE: 7,
FOLLOWED_BY_EQUAL_SIGNS_RE: 9,
FOLLOWED_BY_QUOTES_AND_SEMICOLON_RE: 5,
}
PYTHON_BLACKLIST_REGEX_TO_GROUP = {
FOLLOWED_BY_COLON_QUOTES_REQUIRED_RE: 7,
FOLLOWED_BY_EQUAL_SIGNS_QUOTES_REQUIRED_RE: 9,
FOLLOWED_BY_QUOTES_AND_SEMICOLON_RE: 5,
}


class FileType(Enum):
JAVASCRIPT = 0
PHP = 1
PYTHON = 2
OTHER = 3


def determine_file_type(filename):
"""
:param filename: str
:rtype: FileType
"""
if filename.endswith('.js'):
return FileType.JAVASCRIPT
elif filename.endswith('.py'):
return FileType.PYTHON
elif filename.endswith('.php'):
return FileType.PHP
return FileType.OTHER


class KeywordDetector(BasePlugin):
"""This checks if blacklisted keywords
are present in the analyzed string.
"""

secret_type = 'Password'
secret_type = 'Secret Keyword'

def analyze_string(self, string, line_num, filename):
output = {}

for identifier in self.secret_generator(string):
for identifier in self.secret_generator(
string,
filetype=determine_file_type(filename),
):
secret = PotentialSecret(
self.secret_type,
filename,
Expand All @@ -64,10 +165,71 @@ def analyze_string(self, string, line_num, filename):

return output

def _secret_generator(self, lowercase_string):
for line in BLACKLIST:
if line in lowercase_string:
yield line
def secret_generator(self, string, filetype):
lowered_string = string.lower()

if filetype == FileType.PYTHON:
blacklist_RE_to_group = PYTHON_BLACKLIST_REGEX_TO_GROUP
else:
blacklist_RE_to_group = BLACKLIST_REGEX_TO_GROUP

for REGEX, group_number in blacklist_RE_to_group.items():
match = REGEX.search(lowered_string)
if match:
lowered_secret = match.group(group_number)

# ([^\s]+) guarantees lowered_secret is not ''
if not probably_false_positive(
lowered_secret,
filetype=filetype,
):
yield lowered_secret


def probably_false_positive(lowered_secret, filetype):
if (
'fake' in lowered_secret
or 'forgot' in lowered_secret
or lowered_secret in FALSE_POSITIVES
or (
filetype == FileType.JAVASCRIPT
and (
lowered_secret.startswith('this.')
or lowered_secret.startswith('fs.read')
or lowered_secret == 'new'
)
) or ( # If it is a .php file, do not report $variables
filetype == FileType.PHP
and lowered_secret[0] == '$'
)
):
return True

# Heuristic for no function calls
try:
if (
lowered_secret.index('(') < lowered_secret.index(')')
):
return True
except ValueError:
pass

# Heuristic for e.g. request.json_body['hey']
try:
if (
lowered_secret.index('[') < lowered_secret.index(']')
):
return True
except ValueError:
pass

# Heuristic for e.g. ${link}
try:
if (
lowered_secret.index('${') < lowered_secret.index('}')
):
return True
except ValueError:
pass

def secret_generator(self, string):
return self._secret_generator(string.lower())
return False
2 changes: 1 addition & 1 deletion detect_secrets/pre_commit_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def get_baseline(baseline_filename):
)


def _get_baseline_string_from_file(filename): # pragma: no cover
def _get_baseline_string_from_file(filename): # pragma: no cover
"""Breaking this function up for mockability."""
try:
with open(filename) as f:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
secret = 'BEEF0123456789a'
seecret = 'BEEF0123456789a'
skipped_sequential_false_positive = '0123456789a'
print('second line')
var = 'third line'
4 changes: 2 additions & 2 deletions testing/factories.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

def potential_secret_factory(type_='type', filename='filename', secret='secret', lineno=1):
"""This is only marginally better than creating PotentialSecret objects directly,
because of default values.
because of the default values.
"""
return PotentialSecret(type_, filename, secret, lineno)

Expand All @@ -15,7 +15,7 @@ def secrets_collection_factory(secrets=None, plugins=(), exclude_regex=''):
"""
:type secrets: list(dict)
:param secrets: list of params to pass to add_secret.
Eg. [ {'secret': 'blah'}, ]
E.g. [ {'secret': 'blah'}, ]
:type plugins: tuple
:type exclude_regex: str
Expand Down
2 changes: 1 addition & 1 deletion testing/mocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def mock_git_calls(subprocess_namespace, cases):
:type cases: iterable(SubprocessMock)
:type subprocess_namespace: str
:param subprocess_namespace: should be the namespace referring to check_output.
Eg. `detect_secrets.pre_commit_hook.subprocess.check_output`
E.g. `detect_secrets.pre_commit_hook.subprocess.check_output`
"""
# We need to use a dictionary, because python2.7 does not support
# the `nonlocal` keyword (and needs to share scope with
Expand Down
Loading

0 comments on commit 164b7eb

Please sign in to comment.