Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support url-safe base64 secrets #245

Merged
merged 12 commits into from
Oct 24, 2019
2 changes: 1 addition & 1 deletion detect_secrets/core/secrets_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ def _extract_secrets_from_patch(self, f, plugin, filename):
for line in chunk.target_lines():
if line.is_added:
output.update(
plugin.analyze_string(
plugin.analyze_line(
line.value,
line.target_line_no,
filename,
Expand Down
10 changes: 5 additions & 5 deletions detect_secrets/plugins/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def analyze(self, file, filename):
potential_secrets = {}
file_lines = tuple(file.readlines())
for line_num, line in enumerate(file_lines, start=1):
results = self.analyze_string(line, line_num, filename)
results = self.analyze_line(line, line_num, filename)
if not self.should_verify:
potential_secrets.update(results)
continue
Expand All @@ -121,7 +121,7 @@ def analyze(self, file, filename):

return potential_secrets

def analyze_string(self, string, line_num, filename):
def analyze_line(self, string, line_num, filename):
"""
:param string: string; the line to analyze
:param line_num: integer; line number that is currently being analyzed
Expand Down Expand Up @@ -163,7 +163,7 @@ def analyze_string_content(self, string, line_num, filename):
@abstractmethod
def secret_generator(self, string, *args, **kwargs):
"""Flags secrets in a given string, and yields the raw secret value.
Used in self.analyze_string for PotentialSecret creation.
Used in self.analyze_line for PotentialSecret creation.

:type string: str
:param string: the secret to scan
Expand All @@ -178,7 +178,7 @@ def adhoc_scan(self, string):
check what different plugins say regarding a single line/secret. This
supports that.

This is very similar to self.analyze_string, but allows the flexibility
This is very similar to self.analyze_line, but allows the flexibility
for subclasses to add any other notable info (rather than just a
PotentialSecret type). e.g. HighEntropyStrings adds their Shannon
entropy in which they made their decision.
Expand All @@ -191,7 +191,7 @@ def adhoc_scan(self, string):
<classname>: <returned-value>
"""
# TODO: Handle multiple secrets on single line.
results = self.analyze_string(
results = self.analyze_line(
string,
line_num=0,
filename='does_not_matter',
Expand Down
122 changes: 100 additions & 22 deletions detect_secrets/plugins/common/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,32 +3,13 @@
This abstraction allows for development of later ML work, or further
heuristical determinations (e.g. word filter, entropy comparator).
"""
import re
import string

from detect_secrets.util import is_python_2


def is_false_positive(secret, automaton):
"""
:type secret: str

:type automaton: ahocorasick.Automaton|None
:param automaton: optional automaton for ignoring certain words.

:rtype: bool
Returns True if any false positive heuristic function returns True.
"""
return any(
func(secret, automaton)
for func in
(
_is_found_with_aho_corasick,
_is_sequential_string,
)
)


def _is_found_with_aho_corasick(secret, automaton):
def is_found_with_aho_corasick(secret, automaton):
"""
:type secret: str

Expand All @@ -53,7 +34,7 @@ def _is_found_with_aho_corasick(secret, automaton):
return False


def _is_sequential_string(secret, *args):
def is_sequential_string(secret, *args):
"""
:type secret: str

Expand Down Expand Up @@ -97,3 +78,100 @@ def _is_sequential_string(secret, *args):
return True

return False


# Lowercase-hex UUID pattern (8-4-4-4-12). It has no uppercase classes,
# so is_potential_uuid lowercases its input before searching.
_UUID_REGEX = re.compile(r'[a-f0-9]{8}\-[a-f0-9]{4}\-[a-f0-9]{4}\-[a-f0-9]{4}\-[a-f0-9]{12}')


def is_potential_uuid(secret, *args):
    """
    Checks whether a UUID appears anywhere inside a potential secret.

    :type secret: str

    :rtype: bool
    :returns: True if the lowercased string contains a UUID, False otherwise.
    """
    # A substring search is used on purpose instead of strictly parsing
    # the whole string as a UUID: values that merely embed a UUID (with
    # a prefix or suffix around it) should be treated as false positives
    # as well.
    lowered = secret.lower()
    return _UUID_REGEX.search(lowered) is not None
OiCMudkips marked this conversation as resolved.
Show resolved Hide resolved


# Heuristics that is_false_positive() runs when the caller does not pass
# its own ``functions`` list. Each entry is invoked as
# func(secret, automaton) and a truthy result marks the secret as a
# false positive.
DEFAULT_FALSE_POSITIVE_HEURISTICS = [
is_found_with_aho_corasick,
is_sequential_string,
]


# NOTE: this doesn't handle multiple key-values on a line properly.
# NOTE: words that end in "id" will be treated as ids
_ID_DETECTOR_REGEX = re.compile(r'[iI][dD][^A-Za-z0-9]')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might be able to do _id, we'll see what the data says though.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess it depends on whether we want to ignore keys like BusinessId. I think at Yelp this isn't likely but it's probably more likely in camelCase language repos.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a good point, we do have a lot of python biases.



def is_likely_id_string(secret, line):
    """
    Determines whether a secret looks like the value of an id-like key.

    Only the part of ``line`` that precedes the first occurrence of
    ``secret`` is inspected. If that prefix contains "id" (any casing)
    followed by a non-alphanumeric character, e.g.
    ``foreign_key_id = <secret>``, the secret is considered a likely id.

    :type secret: str

    :type line: str
    :param line: Line context for the plaintext secret

    :rtype: bool
    :returns: True if the secret could be an id, False otherwise
        (including when the secret does not occur in the line at all).
    """
    # A single find() replaces the separate membership test and index()
    # lookup; -1 means the secret is not part of this line.
    secret_index = line.find(secret)
    if secret_index == -1:
        return False

    # endpos stops the search right before the secret, so only the key
    # side of the line is considered. The match object is converted to
    # a real bool so the documented return type actually holds.
    return _ID_DETECTOR_REGEX.search(line, 0, secret_index) is not None


DEFAULT_FALSE_POSITIVE_WITH_LINE_CONTEXT_HEURISTICS = [
is_likely_id_string,
OiCMudkips marked this conversation as resolved.
Show resolved Hide resolved
]


def is_false_positive(secret, automaton, functions=DEFAULT_FALSE_POSITIVE_HEURISTICS):
    """
    Runs a secret through a series of false-positive heuristics.

    :type secret: str

    :type automaton: ahocorasick.Automaton|None
    :param automaton: optional automaton for ignoring certain words.

    :type functions: Iterable[Callable]
    :param functions: heuristics to apply; each one is called as
        ``heuristic(secret, automaton)``.

    :rtype: bool
    :returns: True as soon as one heuristic reports a false positive,
        False if none of them does.
    """
    for heuristic in functions:
        if heuristic(secret, automaton):
            return True

    return False


def is_false_positive_with_line_context(
    secret,
    line,
    functions=DEFAULT_FALSE_POSITIVE_WITH_LINE_CONTEXT_HEURISTICS,
):
    """
    Runs a secret through heuristics that also look at its surrounding line.

    :type secret: str

    :type line: str
    :param line: plaintext line on which secret was found

    :type functions: Iterable[Callable]
    :param functions: heuristics to apply; each one is called as
        ``heuristic(secret, line)``.

    :rtype: bool
    :returns: True as soon as one line-aware heuristic reports a false
        positive, False if none of them does.
    """
    for heuristic in functions:
        if heuristic(secret, line):
            return True

    return False
2 changes: 1 addition & 1 deletion detect_secrets/plugins/common/ini_file_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def iterator(self):
key,
values,
):
yield value, offset
yield key, value, offset

def _get_value_and_line_offset(self, key, values):
"""Returns the index of the location of key, value pair in lines.
Expand Down
5 changes: 5 additions & 0 deletions detect_secrets/plugins/common/yaml_file_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,11 @@ def _tag_dict_values(self, map_node):
value=str(value.tag.endswith(':binary')),
tag='tag:yaml.org,2002:bool',
),
self._create_key_value_pair_for_mapping_node_value(
key='__original_key__',
value=key.value,
tag='tag:yaml.org,2002:str',
),
],
)

Expand Down
75 changes: 62 additions & 13 deletions detect_secrets/plugins/high_entropy_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
from .common.filetype import determine_file_type
from .common.filetype import FileType
from .common.filters import is_false_positive
from .common.filters import is_false_positive_with_line_context
from .common.filters import is_potential_uuid
from .common.filters import DEFAULT_FALSE_POSITIVE_HEURISTICS
from .common.ini_file_parser import IniFileParser
from .common.yaml_file_parser import YamlFileParser
from detect_secrets.core.potential_secret import PotentialSecret
Expand Down Expand Up @@ -83,14 +86,40 @@ def calculate_shannon_entropy(self, data):

return entropy

@staticmethod
def _filter_false_positives_with_line_ctx(potential_secrets, line):
    """Return a new dict without the secrets that the line-context
    heuristics flag as false positives.

    :type potential_secrets: dict
    :param potential_secrets: mapping whose keys expose ``secret_value``

    :type line: str
    :param line: the line on which the secrets were found

    :rtype: dict
    """
    kept = {}
    for candidate, value in potential_secrets.items():
        if is_false_positive_with_line_context(candidate.secret_value, line):
            continue
        kept[candidate] = value

    return kept

def analyze_line(self, string, line_num, filename):
    """Analyze one line with the parent implementation, then drop every
    result that the line-context heuristics consider a false positive.

    :type string: str
    :param string: the line to analyze; also used as the line context
        for filtering
    """
    parent = super(HighEntropyStringsPlugin, self)
    unfiltered = parent.analyze_line(string, line_num, filename)
    return self._filter_false_positives_with_line_ctx(unfiltered, string)

def analyze_string_content(self, string, line_num, filename):
"""Searches string for custom pattern, and captures all high entropy strings that
match self.regex, with a limit defined as self.entropy_limit.
"""
output = {}

for result in self.secret_generator(string):
if is_false_positive(result, self.automaton):
# py2+py3 compatible way of copying a list
functions = list(DEFAULT_FALSE_POSITIVE_HEURISTICS)
functions.append(is_potential_uuid)

if is_false_positive(result, self.automaton, functions=functions):
Copy link
Collaborator

@KevinHock KevinHock Oct 9, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What are your thoughts on passing additional_heuristics instead? I'm not sure when we would want to call is_false_positive without the defaults (main motivation is prettifying though)

Copy link
Contributor Author

@OiCMudkips OiCMudkips Oct 9, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was actually thinking about moving is_false_positive to be a method in BasePlugin and then make subclass re-implement it. This would allow us to override the filters used on a plugin-level (suggested in #250), but also set some reasonable defaults. In addition we can include the heuristics used in the configs for the plugins in baselines.

i.e. in code

class BasePlugin():
    def __init__(self, false_positive_heuristics=None):
        self.false_positive_heuristics = false_positive_heuristics if false_positive_heuristics else []

    def is_false_positive(self, potential_secret):
         return any(func(potential_secret) for func in self.false_positive_heuristics)

    def get_config(self):
         # include the fp heuristics used if applicable


class Plugin(BasePlugin):
    def __init__(self, false_positive_heuristics=DEFAULT_HEURSTICS_FOR_PLUGIN):  # I remember the default list in Python function constructor, I'll fix it in real code :)
        super(Plugin, self).__init__(false_positive_heuristics)

    def analyze_string_content(self, string):
        for potential_secret in self.secret_generator(string):
            if self.is_false_positive(string):
                continue

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That sounds great to me 🎈

I'm only unsure of the In addition we can include the heuristics used in the configs for the plugins in baselines. part, as I'm kind of okay with leaving that part blind to the user. (There are also the lesser possible objections someone could say that diffs in baselines should be minimal, and I'm not sure how we would say which heuristics each plugin used in a DRY way.)

continue

secret = PotentialSecret(self.secret_type, filename, result, line_num)
Expand All @@ -114,7 +143,7 @@ def adhoc_scan(self, string):
# Since it's an individual string, it's just bad UX to require quotes
# around the expected secret.
with self.non_quoted_string_regex():
results = self.analyze_string(
results = self.analyze_line(
string,
line_num=0,
filename='does_not_matter',
Expand Down Expand Up @@ -152,23 +181,27 @@ def _analyze_ini_file(self, add_header=False):
:returns: same format as super().analyze()
"""
def wrapped(file, filename):
potential_secrets = {}
output = {}

with self.non_quoted_string_regex():
for value, lineno in IniFileParser(
for key, value, lineno in IniFileParser(
file,
add_header,
exclude_lines_regex=self.exclude_lines_regex,
).iterator():
potential_secrets.update(
self.analyze_string(
value,
lineno,
filename,
),
potential_secrets = self.analyze_string_content(
value,
lineno,
filename,
)
line = u'{key}={value}'.format(key=key, value=value)
potential_secrets = self._filter_false_positives_with_line_ctx(
potential_secrets,
line,
)
output.update(potential_secrets)

return potential_secrets
return output

return wrapped

Expand Down Expand Up @@ -217,7 +250,7 @@ def _analyze_yaml_file(self, file, filename):
else item['__value__']
)

secrets = self.analyze_string(
secrets = self.analyze_string_content(
string_to_scan,
item['__line__'],
filename,
Expand All @@ -226,6 +259,15 @@ def _analyze_yaml_file(self, file, filename):
if item['__is_binary__']:
secrets = self._encode_yaml_binary_secrets(secrets)

dumped_key_value = yaml.dump({
item['__original_key__']: item['__value__'],
}).replace('\n', '')

secrets = self._filter_false_positives_with_line_ctx(
secrets,
dumped_key_value,
)

potential_secrets.update(secrets)

return potential_secrets
Expand Down Expand Up @@ -339,8 +381,15 @@ class Base64HighEntropyString(HighEntropyStringsPlugin):
secret_type = 'Base64 High Entropy String'

def __init__(self, base64_limit, exclude_lines_regex=None, automaton=None, **kwargs):
charset = (
string.ascii_letters
+ string.digits
+ '+/' # Regular base64
+ '\\-_' # Url-safe base64
+ '=' # Padding
)
super(Base64HighEntropyString, self).__init__(
charset=string.ascii_letters + string.digits + '+/=',
charset=charset,
limit=base64_limit,
exclude_lines_regex=exclude_lines_regex,
automaton=automaton,
Expand Down
4 changes: 4 additions & 0 deletions test_data/config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,7 @@ password = 12345678901234 # pragma: allowlist secret

# unicode
foo=bår

[key with id in name]
real_secret_which_isnt_an_i_d = vh987tyw9ehy8ghis7vwyhiwbwitefy7w3ASDGYDGUASDG
foreign_key_id = vh987tyw9ehy8ghis7vwyhiwbwitefy7w3ASDGYDGUASDG
8 changes: 6 additions & 2 deletions test_data/config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
credentials:
some_value_here: not_a_secret
some_value_here: not_secret
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was this necessary b/c the entropy calculation with the new chars alerted on not_a_secret?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I don't think we need to be too concerned though because we now have the wordlist filtering.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm more concerned that we'll have large diffs in baselines when people update detect-secrets.

This isn't as concerning as changing a secret type like we did in #26 (where all old secrets were removed and re-added), but it is a little, especially if it reduces TPs to some extent. (We'll see what the data says though; I can't really say how it'll affect signal.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why will we have large diffs? A lot of new secrets?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If all the e.g. not_a_secret potential secrets disappear from existing baselines, then there is a possibility we will have large diffs, in the case of FP's that's great, in the case of TP's that would be a regression visible to users. (We can't really say there are minimal regressions without data though.)

other_value_here: 1234567890a
CanonicalUserGetSkippedByExcludeLines: 1234567890ab
nested:
Expand All @@ -11,5 +11,9 @@ list_of_keys:
- 234567890a

test_agent::allowlisted_api_key: 'ToCynx5Se4e2PtoZxEhW7lUJcOX15c54' # pragma: allowlist secret

high_entropy_binary_secret: !!binary MjNjcnh1IDJieXJpdXYyeXJpaTJidnl1MnI4OXkyb3UwMg==

# this should be ignored as a potential id
allowlisted_id: 'ToCynx5Se4e2PtoZxEhW7lUJcOX15c54'

uuid_should_be_ignored: '203db13e-70c7-462b-9a3d-bf32640cb0be'
4 changes: 2 additions & 2 deletions tests/plugins/artifactory_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ class TestArtifactoryDetector(object):
('artifactory:_password=AKCxxxxxxxx', False),
],
)
def test_analyze_string(self, payload, should_flag):
def test_analyze_line(self, payload, should_flag):
logic = ArtifactoryDetector()

output = logic.analyze_string(payload, 1, 'mock_filename')
output = logic.analyze_line(payload, 1, 'mock_filename')
assert len(output) == int(should_flag)
Loading