Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support url-safe base64 secrets #245

Merged
merged 12 commits into from
Oct 24, 2019
33 changes: 27 additions & 6 deletions detect_secrets/plugins/common/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,34 @@ def is_sequential_string(secret, *args):
return False


ALL_FALSE_POSITIVE_HEURISTICS = (
# This only finds UUIDs which only have lowercase characters.
_UUID_REGEX = re.compile(r'[a-f0-9]{8}\-[a-f0-9]{4}\-[a-f0-9]{4}\-[a-f0-9]{4}\-[a-f0-9]{12}')


def is_potential_uuid(secret, *args):
"""
Determines if a potential secret contains any UUIDs.

:type secret: str

:rtype: bool
Returns True if the string has a UUID, false otherwise.
"""

# Using a regex to find strings that look like false-positives
# will find us more false-positives than if we just tried validate
# the input string as a UUID (for example, if the string has a prefix
# or suffix).
return len(_UUID_REGEX.findall(secret.lower())) > 0
OiCMudkips marked this conversation as resolved.
Show resolved Hide resolved


DEFAULT_FALSE_POSITIVE_HEURISTICS = [
is_found_with_aho_corasick,
is_sequential_string,
)
]


# NOTE: this doesn't handle key-values on a line properly.
# NOTE: this doesn't handle multiple key-values on a line properly.
# NOTE: words that end in "id" will be treated as ids
_ID_DETECTOR_REGEX = re.compile(r'[iI][dD][^A-Za-z0-9]')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might be able to do _id, we'll see what the data says though.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess it depends on whether we want to ignore keys like BusinessId. I think at Yelp this isn't likely but it's probably more likely in camelCase language repos.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a good point, we do have a lot of python biases.


Expand All @@ -108,12 +129,12 @@ def is_likely_id_string(secret, line):
return _ID_DETECTOR_REGEX.findall(line, pos=0, endpos=secret_index)


ALL_FALSE_POSITIVE_WITH_LINE_CONTEXT_HEURISTICS = [
DEFAULT_FALSE_POSITIVE_WITH_LINE_CONTEXT_HEURISTICS = [
is_likely_id_string,
OiCMudkips marked this conversation as resolved.
Show resolved Hide resolved
]


def is_false_positive(secret, automaton, functions=ALL_FALSE_POSITIVE_HEURISTICS):
def is_false_positive(secret, automaton, functions=DEFAULT_FALSE_POSITIVE_HEURISTICS):
"""
:type secret: str

Expand All @@ -135,7 +156,7 @@ def is_false_positive(secret, automaton, functions=ALL_FALSE_POSITIVE_HEURISTICS
def is_false_positive_with_line_context(
secret,
line,
functions=ALL_FALSE_POSITIVE_WITH_LINE_CONTEXT_HEURISTICS,
functions=DEFAULT_FALSE_POSITIVE_WITH_LINE_CONTEXT_HEURISTICS,
):
"""
:type secret: str
Expand Down
8 changes: 7 additions & 1 deletion detect_secrets/plugins/high_entropy_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
from .common.filetype import FileType
from .common.filters import is_false_positive
from .common.filters import is_false_positive_with_line_context
from .common.filters import is_potential_uuid
from .common.filters import DEFAULT_FALSE_POSITIVE_HEURISTICS
from .common.ini_file_parser import IniFileParser
from .common.yaml_file_parser import YamlFileParser
from detect_secrets.core.potential_secret import PotentialSecret
Expand Down Expand Up @@ -113,7 +115,11 @@ def analyze_string_content(self, string, line_num, filename):
output = {}

for result in self.secret_generator(string):
if is_false_positive(result, self.automaton):
# py2+py3 compatible way of copying a list
functions = list(DEFAULT_FALSE_POSITIVE_HEURISTICS)
functions.append(is_potential_uuid)

if is_false_positive(result, self.automaton, functions=functions):
Copy link
Collaborator

@KevinHock KevinHock Oct 9, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What are your thoughts on passing additional_heuristics instead? I'm not sure when we would want to call is_false_positive without the defaults (main motivation is prettifying though)

Copy link
Contributor Author

@OiCMudkips OiCMudkips Oct 9, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was actually thinking about moving is_false_positive to be a method in BasePlugin and then make subclass re-implement it. This would allow us to override the filters used on a plugin-level (suggested in #250), but also set some reasonable defaults. In addition we can include the heuristics used in the configs for the plugins in baselines.

i.e. in code

class BasePlugin():
    def __init__(self, false_positive_heuristics=None):
        self.false_positive_heuristics = false_positive_heuristics if false_positive_heuristics else []

    def is_false_positive(self, potential_secret):
         return any(func(potential_secret) for func in self.false_positive_heuristics)

    def get_config(self):
         # include the fp heuristics used if applicable


class Plugin(BasePlugin):
    def __init__(self, false_positive_heuristics=DEFAULT_HEURSTICS_FOR_PLUGIN):  # I remember the default list in Python function constructor, I'll fix it in real code :)
        super(Plugin, self).__init__(false_positive_heuristics)

    def analyze_string_content(self, string):
        for potential_secret in self.secret_generator(string):
            if self.is_false_positive(string):
                continue

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That sounds great to me 🎈

I'm only unsure of the In addition we can include the heuristics used in the configs for the plugins in baselines. part, as I'm kind of okay with leaving that part blind to the user. (There are also the lesser possible objections someone could say that diffs in baselines should be minimal, and I'm not sure how we would say which heuristics each plugin used in a DRY way.)

continue

secret = PotentialSecret(self.secret_type, filename, result, line_num)
Expand Down
2 changes: 2 additions & 0 deletions test_data/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@ high_entropy_binary_secret: !!binary MjNjcnh1IDJieXJpdXYyeXJpaTJidnl1MnI4OXkyb3U

# this should be ignored as a potential id
allowlisted_id: 'ToCynx5Se4e2PtoZxEhW7lUJcOX15c54'

uuid_should_be_ignored: '203db13e-70c7-462b-9a3d-bf32640cb0be'
13 changes: 13 additions & 0 deletions tests/plugins/common/filters_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,16 @@ def test_success(self, secret, line):
)
def test_failure(self, secret, line):
assert not filters.is_likely_id_string(secret, line)


class TestIsPotentialUuid(object):
@pytest.mark.parametrize(
'secret',
[
'3636dd46-ea21-11e9-81b4-2a2ae2dbcce4', # uuid1
'97fb0431-46ac-41df-9ef9-1a18545ce2a0', # uuid4
'prefix-3636dd46-ea21-11e9-81b4-2a2ae2dbcce4-suffix', # uuid in middle of string
],
)
def test_success(self, secret):
assert filters.is_potential_uuid(secret)