Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

YAML files don't support inline whitelisting #50

Merged
merged 4 commits into from
Jul 13, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions detect_secrets/plugins/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def __init__(self, **kwargs):
if not self.secret_type:
raise ValueError('Plugins need to declare a secret_type.')

def analyze(self, file, filename): # pragma: no cover
def analyze(self, file, filename):
"""
:param file: The File object itself.
:param filename: string; filename of File object, used for creating
Expand All @@ -29,7 +29,7 @@ def analyze(self, file, filename): # pragma: no cover
return potential_secrets

@abstractmethod
def analyze_string(self, string, line_num, filename): # pragma: no cover
def analyze_string(self, string, line_num, filename):
"""
:param string: string; the line to analyze
:param line_num: integer; line number that is currently being analyzed
Expand All @@ -38,17 +38,17 @@ def analyze_string(self, string, line_num, filename): # pragma: no cover

NOTE: line_num and filename are used for PotentialSecret creation only.
"""
pass
raise NotImplementedError
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't really needed, because abc.abstractmethod will prevent it from even being initialized.


@abstractmethod
def secret_generator(self, string): # pragma: no cover
def secret_generator(self, string):
"""Flags secrets in a given string, and yields the raw secret value.
Used in self.analyze_string for PotentialSecret creation.

:type string: str
:param string: the secret to scan
"""
pass
raise NotImplementedError

@property
def __dict__(self):
Expand Down
4 changes: 4 additions & 0 deletions detect_secrets/plugins/core/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
import re

# TODO: Update for not just python comments?
WHITELIST_REGEX = re.compile(r'# ?pragma: ?whitelist[ -]secret')
141 changes: 141 additions & 0 deletions detect_secrets/plugins/core/ini_file_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
import configparser
import re


class IniFileParser(object):

def __init__(self, file):
self.parser = configparser.ConfigParser()
self.parser.optionxform = str
self.parser.read_file(file)

# Hacky way to keep track of line location
file.seek(0)
self.lines = list(map(lambda x: x.strip(), file.readlines()))
self.line_offset = 0

def iterator(self):
if not self.parser.sections():
# To prevent cases where it's not an ini file, but the parser
# helpfully attempts to parse everything to a DEFAULT section,
# when not explicitly provided.
raise configparser.Error

for section_name, _ in self.parser.items():
for key, values in self.parser.items(section_name):
for value, offset in self._get_value_and_line_offset(
key,
values,
):
yield value, offset

def _get_value_and_line_offset(self, key, values):
"""Returns the index of the location of key, value pair in lines.

:type key: str
:param key: key, in config file.

:type values: str
:param values: values for key, in config file. This is plural,
because you can have multiple values per key. Eg.

>>> key =
... value1
... value2

:type lines: list
:param lines: a collection of lines-so-far in file

:rtype: list(tuple)
"""
values_list = self._construct_values_list(values)
if not values_list:
return []

current_value_list_index = 0
output = []
lines_modified = False

first_line_regex = re.compile(r'^\s*{}[ :=]+{}'.format(
re.escape(key),
re.escape(values_list[current_value_list_index]),
))
comment_regex = re.compile(r'\s*[;#]')
for index, line in enumerate(self.lines):
if current_value_list_index == 0:
if first_line_regex.match(line):
output.append((
values_list[current_value_list_index],
self.line_offset + index + 1,
))

current_value_list_index += 1

continue

# Check ignored lines before checking values, because
# you can write comments *after* the value.

# Ignore blank lines
if not line.strip():
continue

# Ignore comments
if comment_regex.match(line):
continue

if current_value_list_index == len(values_list):
if index == 0:
index = 1 # don't want to count the same line again

self.line_offset += index
self.lines = self.lines[index:]
lines_modified = True

break
else:
output.append((
values_list[current_value_list_index],
self.line_offset + index + 1,
))

current_value_list_index += 1

if not lines_modified:
# No more lines left, if loop was not explicitly left.
self.lines = []

return output

@staticmethod
def _construct_values_list(values):
"""
This values_list is a strange construction, because of ini format.
We need to extract the values with the following supported format:

>>> key = value0
... value1
...
... # comment line here
... value2

given that normally, either value0 is supplied, or (value1, value2),
but still allowing for all three at once.

Furthermore, with the configparser, we will get a list of values,
and intermediate blank lines, but no comments. This means that we can't
merely use the count of values' items to heuristically "skip ahead" lines,
because we still have to manually parse through this.

Therefore, we construct the values_list in the following fashion:
1. Keep the first value (in the example, this is `value0`)
2. For all other values, ignore blank lines.
Then, we can parse through, and look for values only.
"""
values_list = values.splitlines()
return values_list[:1] + list(
filter(
lambda x: x,
values_list[1:],
),
)
133 changes: 133 additions & 0 deletions detect_secrets/plugins/core/yaml_file_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import yaml

from detect_secrets.plugins.core.constants import WHITELIST_REGEX


class YamlFileParser(object):
"""
Yaml config files are interesting, because they don't necessarily conform
to our basic regex for detecting HighEntropyStrings as strings don't
need to be quoted.

This causes interesting issues, because our regex won't catch non-quoted
strings, and if we ignore the quoting requirement, then we increase our
false positive rate, because any long string would have high entropy.

Therefore, we take a different approach: intercept the parsing of the yaml
file to identify string values. This assumes:

1. Secrets are strings
2. Secrets are not keys

Then, we calculate the entropy of those string values.

The difficulty comes from determining the line number which these values
come from. To do this, we transform the string into a dictionary of
meta-tags, in the following format:

>>> {
'key': {
'__value__': value,
'__line__': <line_number>,
}
}

This way, we can quickly identify the line number for auditing at a later
stage.

This parsing method is inspired by https://stackoverflow.com/a/13319530.
"""

def __init__(self, file):
self.content = file.read()
self.loader = yaml.SafeLoader(self.content)

self.loader.compose_node = self._compose_node_shim

def json(self):
return self.loader.get_single_data()

def _compose_node_shim(self, parent, index):
line = self.loader.line

node = yaml.composer.Composer.compose_node(self.loader, parent, index)
node.__line__ = line + 1

if node.tag.endswith(':map'):
return self._tag_dict_values(node)

# TODO: Not sure if need to do :seq

return node

def _tag_dict_values(self, map_node):
"""
:type map_node: yaml.nodes.MappingNode
:param map_node: It looks like map_node.value contains a list of
pair tuples, corresponding to key,value pairs.
"""
new_values = []
for key, value in map_node.value:
if not value.tag.endswith(':str'):
new_values.append((key, value,))
continue

augmented_string = yaml.nodes.MappingNode(
tag=map_node.tag,
value=[
self._create_key_value_pair_for_mapping_node_value(
'__value__',
value.value,
'tag:yaml.org,2002:str',
),
self._create_key_value_pair_for_mapping_node_value(
'__line__',
str(value.__line__),
'tag:yaml.org,2002:int',
),
],
)

new_values.append((key, augmented_string,))

output = yaml.nodes.MappingNode(
tag=map_node.tag,
value=new_values,
start_mark=map_node.start_mark,
end_mark=map_node.end_mark,
flow_style=map_node.flow_style,
)
return output

@staticmethod
def _create_key_value_pair_for_mapping_node_value(key, value, tag):
return (
yaml.nodes.ScalarNode(
tag='tag:yaml.org,2002:str',
value=key,
),
yaml.nodes.ScalarNode(
tag=tag,
value=value,
),
)

def get_ignored_lines(self):
"""
Return a set of integers that refer to line numbers that were
whitelisted by the user and should be ignored.

We need to parse the file separately from PyYAML parsing because
the parser drops the comments (at least up to version 3.13):
https://github.com/yaml/pyyaml/blob/a2d481b8dbd2b352cb001f07091ccf669227290f/lib3/yaml/scanner.py#L749

:return: set
"""

ignored_lines = set()

for line_number, line in enumerate(self.content.split('\n'), 1):
if WHITELIST_REGEX.search(line):
ignored_lines.add(line_number)

return ignored_lines
Loading