Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handling yaml files #16

Merged
merged 4 commits into from
Apr 10, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
176 changes: 170 additions & 6 deletions detect_secrets/plugins/high_entropy_strings.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from __future__ import absolute_import

import math
import os
import re
import string
from contextlib import contextmanager

import yaml
from future import standard_library

from .base import BasePlugin
Expand All @@ -13,6 +15,12 @@
import configparser # noqa: E402


YAML_EXTENSIONS = (
'.yaml',
'.yml',
)


class HighEntropyStringsPlugin(BasePlugin):
"""Base class for string pattern matching"""

Expand All @@ -33,10 +41,16 @@ def __init__(self, charset, limit, *args):
self.ignore_regex = re.compile(r'# ?pragma: ?whitelist[ -]secret')

def analyze(self, file, filename):
try:
return self._analyze_ini_file(file, filename)
except configparser.Error:
file.seek(0)
file_type_analyzers = (
('_analyze_ini_file', configparser.Error,),
('_analyze_yaml_file', yaml.YAMLError,),
)

for function, error in file_type_analyzers:
try:
return getattr(self, function)(file, filename)
except error:
file.seek(0)

return super(HighEntropyStringsPlugin, self).analyze(file, filename)

Expand Down Expand Up @@ -101,7 +115,11 @@ def _analyze_ini_file(self, file, filename):
for section_name, _ in parser.items():
for key, value in parser.items(section_name):
# +1, because we don't want to double count lines
offset = self._get_line_offset(key, value, lines) + 1
offset = self._get_line_offset_for_ini_files(
key,
value,
lines
) + 1
line_offset += offset
lines = lines[offset:]

Expand All @@ -115,6 +133,44 @@ def _analyze_ini_file(self, file, filename):

return potential_secrets

def _analyze_yaml_file(self, file, filename):
"""
:returns: same format as super().analyze()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: Maybe add e.g. :raises: yaml.YAMLError and the same w/ configparser.Error to _analyze_ini_file.

"""
if os.path.splitext(filename)[1] not in YAML_EXTENSIONS:
# The yaml parser is pretty powerful. It eagerly
# parses things when it's not even a yaml file. Therefore,
# we use this heuristic to quit early if appropriate.
raise yaml.YAMLError

data = YamlLineInjector(file).json()
potential_secrets = {}

to_search = [data]
with self._non_quoted_string_regex():
while len(to_search) > 0:
item = to_search.pop()

try:
if '__line__' in item:
potential_secrets.update(
self.analyze_string(
item['__value__'],
item['__line__'],
filename,
),
)
continue

for key in item:
obj = item[key] if isinstance(item, dict) else key
if isinstance(obj, dict):
to_search.append(obj)
except TypeError:
pass

return potential_secrets

@property
def __dict__(self):
output = super(HighEntropyStringsPlugin, self).__dict__
Expand All @@ -138,7 +194,7 @@ def _non_quoted_string_regex(self):
self.regex = old_regex

@staticmethod
def _get_line_offset(key, value, lines):
def _get_line_offset_for_ini_files(key, value, lines):
"""Returns the index of the location of key, value pair in lines.

:type key: str
Expand Down Expand Up @@ -171,3 +227,111 @@ def __init__(self, limit, *args):
string.ascii_letters + string.digits + '+/=',
limit
)


class YamlLineInjector(object):
"""
Yaml config files are interesting, because they don't necessarily conform
to our basic regex for detecting HighEntropyStrings as strings don't
need to be quoted.

This causes interesting issues, because our regex won't catch non-quoted
strings, and if we ignore the quoting requirement, then we increase our
false positive rate, because any long string would have high entropy.

Therefore, we take a different approach: intercept the parsing of the yaml
file to identify string values. This assumes:

1. Secrets are strings
2. Secrets are not keys

Then, we calculate the entropy of those string values.

The difficulty comes from determining the line number which these values
come from. To do this, we transform the string into a dictionary of
meta-tags, in the following format:

>>> {
'key': {
'__value__': value,
'__line__': <line_number>,
}
}

This way, we can quickly identify the line number for auditing at a later
stage.

This parsing method is inspired by https://stackoverflow.com/a/13319530.
"""

def __init__(self, file):
self.loader = yaml.SafeLoader(file.read())

self.loader.compose_node = self._compose_node_shim

def json(self):
return self.loader.get_single_data()

def _compose_node_shim(self, parent, index):
line = self.loader.line

node = yaml.composer.Composer.compose_node(self.loader, parent, index)
node.__line__ = line + 1

if node.tag.endswith(':map'):
return self._tag_dict_values(node)

# TODO: Not sure if need to do :seq

return node

def _tag_dict_values(self, map_node):
"""
:type map_node: yaml.nodes.MappingNode
:param map_node: It looks like map_node.value contains a list of
pair tuples, corresponding to key,value pairs.
"""
new_values = []
for key, value in map_node.value:
if not value.tag.endswith(':str'):
new_values.append((key, value,))
continue

augmented_string = yaml.nodes.MappingNode(
tag=map_node.tag,
value=[
self._create_key_value_pair_for_mapping_node_value(
'__value__',
value.value,
'tag:yaml.org,2002:str',
),
self._create_key_value_pair_for_mapping_node_value(
'__line__',
str(value.__line__),
'tag:yaml.org,2002:int',
),
],
)

new_values.append((key, augmented_string,))

output = yaml.nodes.MappingNode(
tag=map_node.tag,
value=new_values,
start_mark=map_node.start_mark,
end_mark=map_node.end_mark,
flow_style=map_node.flow_style,
)
return output

def _create_key_value_pair_for_mapping_node_value(self, key, value, tag):
return (
yaml.nodes.ScalarNode(
tag='tag:yaml.org,2002:str',
value=key,
),
yaml.nodes.ScalarNode(
tag=tag,
value=value,
),
)
9 changes: 9 additions & 0 deletions test_data/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
credentials:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What are your thoughts on badly formatted YAML files? If I'm reading the code right the line-numbers won't be correct but maybe we decide it doesn't matter that much.

e.g.

repos: [
    {repo: ..., sha: ..., hooks: [{id: ...}]},
]

some_value_here: not_a_secret
other_value_here: 1234567890a
nested:
value: abcdefghijklmnop
list_of_keys:
- 123
- 456
- 234567890a
14 changes: 14 additions & 0 deletions tests/plugins/high_entropy_strings_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,20 @@ def test_ini_file(self):
'Location: test_data/files/file_with_secrets.py:3',
)

def test_yaml_file(self):
plugin = Base64HighEntropyString(3)

with open('test_data/config.yaml') as f:
secrets = plugin.analyze(f, 'test_data/config.yaml')

assert len(secrets.values()) == 2
for secret in secrets.values():
location = str(secret).splitlines()[1]
assert location in (
'Location: test_data/config.yaml:3',
'Location: test_data/config.yaml:5',
)


class TestBase64HighEntropyStrings(HighEntropyStringsTest):

Expand Down