Skip to content

Commit

Permalink
Merge pull request #16 from Yelp/handling-yaml-files
Browse files Browse the repository at this point in the history
Handling yaml files
  • Loading branch information
domanchi authored Apr 10, 2018
2 parents f3a111e + 0ee222b commit c54ad3a
Show file tree
Hide file tree
Showing 3 changed files with 193 additions and 6 deletions.
176 changes: 170 additions & 6 deletions detect_secrets/plugins/high_entropy_strings.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from __future__ import absolute_import

import math
import os
import re
import string
from contextlib import contextmanager

import yaml
from future import standard_library

from .base import BasePlugin
Expand All @@ -13,6 +15,12 @@
import configparser # noqa: E402


YAML_EXTENSIONS = (
'.yaml',
'.yml',
)


class HighEntropyStringsPlugin(BasePlugin):
"""Base class for string pattern matching"""

Expand All @@ -33,10 +41,16 @@ def __init__(self, charset, limit, *args):
self.ignore_regex = re.compile(r'# ?pragma: ?whitelist[ -]secret')

def analyze(self, file, filename):
try:
return self._analyze_ini_file(file, filename)
except configparser.Error:
file.seek(0)
file_type_analyzers = (
('_analyze_ini_file', configparser.Error,),
('_analyze_yaml_file', yaml.YAMLError,),
)

for function, error in file_type_analyzers:
try:
return getattr(self, function)(file, filename)
except error:
file.seek(0)

return super(HighEntropyStringsPlugin, self).analyze(file, filename)

Expand Down Expand Up @@ -101,7 +115,11 @@ def _analyze_ini_file(self, file, filename):
for section_name, _ in parser.items():
for key, value in parser.items(section_name):
# +1, because we don't want to double count lines
offset = self._get_line_offset(key, value, lines) + 1
offset = self._get_line_offset_for_ini_files(
key,
value,
lines
) + 1
line_offset += offset
lines = lines[offset:]

Expand All @@ -115,6 +133,44 @@ def _analyze_ini_file(self, file, filename):

return potential_secrets

def _analyze_yaml_file(self, file, filename):
"""
:returns: same format as super().analyze()
"""
if os.path.splitext(filename)[1] not in YAML_EXTENSIONS:
# The yaml parser is pretty powerful. It eagerly
# parses things when it's not even a yaml file. Therefore,
# we use this heuristic to quit early if appropriate.
raise yaml.YAMLError

data = YamlLineInjector(file).json()
potential_secrets = {}

to_search = [data]
with self._non_quoted_string_regex():
while len(to_search) > 0:
item = to_search.pop()

try:
if '__line__' in item:
potential_secrets.update(
self.analyze_string(
item['__value__'],
item['__line__'],
filename,
),
)
continue

for key in item:
obj = item[key] if isinstance(item, dict) else key
if isinstance(obj, dict):
to_search.append(obj)
except TypeError:
pass

return potential_secrets

@property
def __dict__(self):
output = super(HighEntropyStringsPlugin, self).__dict__
Expand All @@ -138,7 +194,7 @@ def _non_quoted_string_regex(self):
self.regex = old_regex

@staticmethod
def _get_line_offset(key, value, lines):
def _get_line_offset_for_ini_files(key, value, lines):
"""Returns the index of the location of key, value pair in lines.
:type key: str
Expand Down Expand Up @@ -171,3 +227,111 @@ def __init__(self, limit, *args):
string.ascii_letters + string.digits + '+/=',
limit
)


class YamlLineInjector(object):
"""
Yaml config files are interesting, because they don't necessarily conform
to our basic regex for detecting HighEntropyStrings as strings don't
need to be quoted.
This causes interesting issues, because our regex won't catch non-quoted
strings, and if we ignore the quoting requirement, then we increase our
false positive rate, because any long string would have high entropy.
Therefore, we take a different approach: intercept the parsing of the yaml
file to identify string values. This assumes:
1. Secrets are strings
2. Secrets are not keys
Then, we calculate the entropy of those string values.
The difficulty comes from determining the line number which these values
come from. To do this, we transform the string into a dictionary of
meta-tags, in the following format:
>>> {
'key': {
'__value__': value,
'__line__': <line_number>,
}
}
This way, we can quickly identify the line number for auditing at a later
stage.
This parsing method is inspired by https://stackoverflow.com/a/13319530.
"""

def __init__(self, file):
self.loader = yaml.SafeLoader(file.read())

self.loader.compose_node = self._compose_node_shim

def json(self):
return self.loader.get_single_data()

def _compose_node_shim(self, parent, index):
line = self.loader.line

node = yaml.composer.Composer.compose_node(self.loader, parent, index)
node.__line__ = line + 1

if node.tag.endswith(':map'):
return self._tag_dict_values(node)

# TODO: Not sure if need to do :seq

return node

def _tag_dict_values(self, map_node):
"""
:type map_node: yaml.nodes.MappingNode
:param map_node: It looks like map_node.value contains a list of
pair tuples, corresponding to key,value pairs.
"""
new_values = []
for key, value in map_node.value:
if not value.tag.endswith(':str'):
new_values.append((key, value,))
continue

augmented_string = yaml.nodes.MappingNode(
tag=map_node.tag,
value=[
self._create_key_value_pair_for_mapping_node_value(
'__value__',
value.value,
'tag:yaml.org,2002:str',
),
self._create_key_value_pair_for_mapping_node_value(
'__line__',
str(value.__line__),
'tag:yaml.org,2002:int',
),
],
)

new_values.append((key, augmented_string,))

output = yaml.nodes.MappingNode(
tag=map_node.tag,
value=new_values,
start_mark=map_node.start_mark,
end_mark=map_node.end_mark,
flow_style=map_node.flow_style,
)
return output

def _create_key_value_pair_for_mapping_node_value(self, key, value, tag):
return (
yaml.nodes.ScalarNode(
tag='tag:yaml.org,2002:str',
value=key,
),
yaml.nodes.ScalarNode(
tag=tag,
value=value,
),
)
9 changes: 9 additions & 0 deletions test_data/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
credentials:
some_value_here: not_a_secret
other_value_here: 1234567890a
nested:
value: abcdefghijklmnop
list_of_keys:
- 123
- 456
- 234567890a
14 changes: 14 additions & 0 deletions tests/plugins/high_entropy_strings_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,20 @@ def test_ini_file(self):
'Location: test_data/files/file_with_secrets.py:3',
)

def test_yaml_file(self):
plugin = Base64HighEntropyString(3)

with open('test_data/config.yaml') as f:
secrets = plugin.analyze(f, 'test_data/config.yaml')

assert len(secrets.values()) == 2
for secret in secrets.values():
location = str(secret).splitlines()[1]
assert location in (
'Location: test_data/config.yaml:3',
'Location: test_data/config.yaml:5',
)


class TestBase64HighEntropyStrings(HighEntropyStringsTest):

Expand Down

0 comments on commit c54ad3a

Please sign in to comment.