From 270a75222ffa24d1ba4bfdcafd7b892e90c614c9 Mon Sep 17 00:00:00 2001 From: Louis Trezzini Date: Wed, 4 Jul 2018 03:55:46 -0700 Subject: [PATCH 1/4] High entropy string should be whitelist-able --- test_data/config.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_data/config.yaml b/test_data/config.yaml index 27f9dc252..46fdd800b 100644 --- a/test_data/config.yaml +++ b/test_data/config.yaml @@ -7,3 +7,5 @@ list_of_keys: - 123 - 456 - 234567890a + +test_agent::whitelisted_api_key: 'ToCynx5Se4e2PtoZxEhW7lUJcOX15c54' # pragma: whitelist secret From 42373eb2cafb8a742d93497d0b713d21e74118e4 Mon Sep 17 00:00:00 2001 From: Louis Trezzini Date: Thu, 5 Jul 2018 08:26:41 -0700 Subject: [PATCH 2/4] add whitelisted secret in python file --- test_data/files/file_with_secrets.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test_data/files/file_with_secrets.py b/test_data/files/file_with_secrets.py index b8049b4ed..eeaaa5707 100644 --- a/test_data/files/file_with_secrets.py +++ b/test_data/files/file_with_secrets.py @@ -1,6 +1,7 @@ #!/usr/bin/python # Will change this later. SUPER_SECRET_VALUE = 'c3VwZXIgbG9uZyBzdHJpbmcgc2hvdWxkIGNhdXNlIGVub3VnaCBlbnRyb3B5' +VERY_SECRET_TOO = 'f6CGV4aMM9zedoh3OUNbSakBymo7yplB' # pragma: whitelist secret def main(): From f31080b78f5ec5cd7b9efc25665ec52d97422ba7 Mon Sep 17 00:00:00 2001 From: Louis Trezzini Date: Fri, 6 Jul 2018 12:07:52 +0100 Subject: [PATCH 3/4] Support inline whitelisting for YAML files --- detect_secrets/plugins/base.py | 10 +- detect_secrets/plugins/core/constants.py | 4 + .../plugins/core/ini_file_parser.py | 141 +++++++++ .../plugins/core/yaml_file_parser.py | 122 ++++++++ .../plugins/high_entropy_strings.py | 288 ++---------------- tests/plugins/core/yaml_file_parser_test.py | 20 ++ 6 files changed, 313 insertions(+), 272 deletions(-) create mode 100644 detect_secrets/plugins/core/constants.py create mode 100644 detect_secrets/plugins/core/ini_file_parser.py create mode 100644 detect_secrets/plugins/core/yaml_file_parser.py create mode 100644 tests/plugins/core/yaml_file_parser_test.py diff --git a/detect_secrets/plugins/base.py b/detect_secrets/plugins/base.py index afae65fb5..9b108ae84 100644 --- a/detect_secrets/plugins/base.py +++ b/detect_secrets/plugins/base.py @@ -12,7 +12,7 @@ def __init__(self, **kwargs): if not self.secret_type: raise ValueError('Plugins need to declare a secret_type.') - def analyze(self, file, filename): # pragma: no cover + def analyze(self, file, filename): """ :param file: The File object itself. :param filename: string; filename of File object, used for creating @@ -29,7 +29,7 @@ def analyze(self, file, filename): # pragma: no cover return potential_secrets @abstractmethod - def analyze_string(self, string, line_num, filename): # pragma: no cover + def analyze_string(self, string, line_num, filename): """ :param string: string; the line to analyze :param line_num: integer; line number that is currently being analyzed @@ -38,17 +38,17 @@ def analyze_string(self, string, line_num, filename): # pragma: no cover NOTE: line_num and filename are used for PotentialSecret creation only. """ - pass + raise NotImplementedError @abstractmethod - def secret_generator(self, string): # pragma: no cover + def secret_generator(self, string): """Flags secrets in a given string, and yields the raw secret value. Used in self.analyze_string for PotentialSecret creation. :type string: str :param string: the secret to scan """ - pass + raise NotImplementedError @property def __dict__(self): diff --git a/detect_secrets/plugins/core/constants.py b/detect_secrets/plugins/core/constants.py new file mode 100644 index 000000000..1df984125 --- /dev/null +++ b/detect_secrets/plugins/core/constants.py @@ -0,0 +1,4 @@ +import re + +# TODO: Update for not just python comments? +WHITELIST_REGEX = re.compile(r'# ?pragma: ?whitelist[ -]secret') diff --git a/detect_secrets/plugins/core/ini_file_parser.py b/detect_secrets/plugins/core/ini_file_parser.py new file mode 100644 index 000000000..75be85806 --- /dev/null +++ b/detect_secrets/plugins/core/ini_file_parser.py @@ -0,0 +1,141 @@ +import configparser +import re + + +class IniFileParser(object): + + def __init__(self, file): + self.parser = configparser.ConfigParser() + self.parser.optionxform = str + self.parser.read_file(file) + + # Hacky way to keep track of line location + file.seek(0) + self.lines = list(map(lambda x: x.strip(), file.readlines())) + self.line_offset = 0 + + def iterator(self): + if not self.parser.sections(): + # To prevent cases where it's not an ini file, but the parser + # helpfully attempts to parse everything to a DEFAULT section, + # when not explicitly provided. + raise configparser.Error + + for section_name, _ in self.parser.items(): + for key, values in self.parser.items(section_name): + for value, offset in self._get_value_and_line_offset( + key, + values, + ): + yield value, offset + + def _get_value_and_line_offset(self, key, values): + """Returns the index of the location of key, value pair in lines. + + :type key: str + :param key: key, in config file. + + :type values: str + :param values: values for key, in config file. This is plural, + because you can have multiple values per key. Eg. + + >>> key = + ... value1 + ... value2 + + :type lines: list + :param lines: a collection of lines-so-far in file + + :rtype: list(tuple) + """ + values_list = self._construct_values_list(values) + if not values_list: + return [] + + current_value_list_index = 0 + output = [] + lines_modified = False + + first_line_regex = re.compile(r'^\s*{}[ :=]+{}'.format( + re.escape(key), + re.escape(values_list[current_value_list_index]), + )) + comment_regex = re.compile(r'\s*[;#]') + for index, line in enumerate(self.lines): + if current_value_list_index == 0: + if first_line_regex.match(line): + output.append(( + values_list[current_value_list_index], + self.line_offset + index + 1, + )) + + current_value_list_index += 1 + + continue + + # Check ignored lines before checking values, because + # you can write comments *after* the value. + + # Ignore blank lines + if not line.strip(): + continue + + # Ignore comments + if comment_regex.match(line): + continue + + if current_value_list_index == len(values_list): + if index == 0: + index = 1 # don't want to count the same line again + + self.line_offset += index + self.lines = self.lines[index:] + lines_modified = True + + break + else: + output.append(( + values_list[current_value_list_index], + self.line_offset + index + 1, + )) + + current_value_list_index += 1 + + if not lines_modified: + # No more lines left, if loop was not explicitly left. + self.lines = [] + + return output + + @staticmethod + def _construct_values_list(values): + """ + This values_list is a strange construction, because of ini format. + We need to extract the values with the following supported format: + + >>> key = value0 + ... value1 + ... + ... # comment line here + ... value2 + + given that normally, either value0 is supplied, or (value1, value2), + but still allowing for all three at once. + + Furthermore, with the configparser, we will get a list of values, + and intermediate blank lines, but no comments. This means that we can't + merely use the count of values' items to heuristically "skip ahead" lines, + because we still have to manually parse through this. + + Therefore, we construct the values_list in the following fashion: + 1. Keep the first value (in the example, this is `value0`) + 2. For all other values, ignore blank lines. + Then, we can parse through, and look for values only. + """ + values_list = values.splitlines() + return values_list[:1] + list( + filter( + lambda x: x, + values_list[1:], + ), + ) diff --git a/detect_secrets/plugins/core/yaml_file_parser.py b/detect_secrets/plugins/core/yaml_file_parser.py new file mode 100644 index 000000000..4ea38f361 --- /dev/null +++ b/detect_secrets/plugins/core/yaml_file_parser.py @@ -0,0 +1,122 @@ +import yaml + +from detect_secrets.plugins.core.constants import WHITELIST_REGEX + + +class YamlFileParser(object): + """ + Yaml config files are interesting, because they don't necessarily conform + to our basic regex for detecting HighEntropyStrings as strings don't + need to be quoted. + + This causes interesting issues, because our regex won't catch non-quoted + strings, and if we ignore the quoting requirement, then we increase our + false positive rate, because any long string would have high entropy. + + Therefore, we take a different approach: intercept the parsing of the yaml + file to identify string values. This assumes: + + 1. Secrets are strings + 2. Secrets are not keys + + Then, we calculate the entropy of those string values. + + The difficulty comes from determining the line number which these values + come from. To do this, we transform the string into a dictionary of + meta-tags, in the following format: + + >>> { + 'key': { + '__value__': value, + '__line__': , + } + } + + This way, we can quickly identify the line number for auditing at a later + stage. + + This parsing method is inspired by https://stackoverflow.com/a/13319530. + """ + + def __init__(self, file): + self.content = file.read() + self.loader = yaml.SafeLoader(self.content) + + self.loader.compose_node = self._compose_node_shim + + def json(self): + return self.loader.get_single_data() + + def _compose_node_shim(self, parent, index): + line = self.loader.line + + node = yaml.composer.Composer.compose_node(self.loader, parent, index) + node.__line__ = line + 1 + + if node.tag.endswith(':map'): + return self._tag_dict_values(node) + + # TODO: Not sure if need to do :seq + + return node + + def _tag_dict_values(self, map_node): + """ + :type map_node: yaml.nodes.MappingNode + :param map_node: It looks like map_node.value contains a list of + pair tuples, corresponding to key,value pairs. + """ + new_values = [] + for key, value in map_node.value: + if not value.tag.endswith(':str'): + new_values.append((key, value,)) + continue + + augmented_string = yaml.nodes.MappingNode( + tag=map_node.tag, + value=[ + self._create_key_value_pair_for_mapping_node_value( + '__value__', + value.value, + 'tag:yaml.org,2002:str', + ), + self._create_key_value_pair_for_mapping_node_value( + '__line__', + str(value.__line__), + 'tag:yaml.org,2002:int', + ), + ], + ) + + new_values.append((key, augmented_string,)) + + output = yaml.nodes.MappingNode( + tag=map_node.tag, + value=new_values, + start_mark=map_node.start_mark, + end_mark=map_node.end_mark, + flow_style=map_node.flow_style, + ) + return output + + @staticmethod + def _create_key_value_pair_for_mapping_node_value(key, value, tag): + return ( + yaml.nodes.ScalarNode( + tag='tag:yaml.org,2002:str', + value=key, + ), + yaml.nodes.ScalarNode( + tag=tag, + value=value, + ), + ) + + def get_ignored_lines(self): + ignored_lines = set() + + for line_number, line in enumerate(self.content.split('\n'), 1): + if WHITELIST_REGEX.search(line): + ignored_lines.add(line_number) + + return ignored_lines diff --git a/detect_secrets/plugins/high_entropy_strings.py b/detect_secrets/plugins/high_entropy_strings.py index 8762d3ddd..3554a6940 100644 --- a/detect_secrets/plugins/high_entropy_strings.py +++ b/detect_secrets/plugins/high_entropy_strings.py @@ -12,6 +12,9 @@ from .base import BasePlugin from detect_secrets.core.potential_secret import PotentialSecret +from detect_secrets.plugins.core.constants import WHITELIST_REGEX +from detect_secrets.plugins.core.ini_file_parser import IniFileParser +from detect_secrets.plugins.core.yaml_file_parser import YamlFileParser YAML_EXTENSIONS = ( @@ -35,20 +38,16 @@ def __init__(self, charset, limit, *args): self.entropy_limit = limit self.regex = re.compile(r'([\'"])([%s]+)(\1)' % charset) - # Allow whitelisting individual lines. - # TODO: Update for not just python comments? - self.ignore_regex = re.compile(r'# ?pragma: ?whitelist[ -]secret') - def analyze(self, file, filename): file_type_analyzers = ( - ('_analyze_ini_file', configparser.Error,), - ('_analyze_yaml_file', yaml.YAMLError,), + (self._analyze_ini_file, configparser.Error,), + (self._analyze_yaml_file, yaml.YAMLError,), ) - for function, error in file_type_analyzers: + for analyze_function, exception_class in file_type_analyzers: try: - return getattr(self, function)(file, filename) - except error: + return analyze_function(file, filename) + except exception_class: file.seek(0) return super(HighEntropyStringsPlugin, self).analyze(file, filename) @@ -58,8 +57,7 @@ def calculate_shannon_entropy(self, data): Borrowed from: http://blog.dkbza.org/2007/05/scanning-data-for-entropy-anomalies.html. - :param string: string. The word to analyze. - :param charset: string. The character set from which to calculate entropy. + :param data: string. The word to analyze. :returns: float, between 0.0 and 8.0 """ if not data: # pragma: no cover @@ -80,7 +78,7 @@ def analyze_string(self, string, line_num, filename): output = {} - if self.ignore_regex.search(string): + if WHITELIST_REGEX.search(string): return output for result in self.secret_generator(string): @@ -151,7 +149,9 @@ def _analyze_yaml_file(self, file, filename): # we use this heuristic to quit early if appropriate. raise yaml.YAMLError - data = YamlLineInjector(file).json() + parser = YamlFileParser(file) + data = parser.json() + ignored_lines = parser.get_ignored_lines() potential_secrets = {} to_search = [data] @@ -161,13 +161,14 @@ def _analyze_yaml_file(self, file, filename): try: if '__line__' in item: - potential_secrets.update( - self.analyze_string( - item['__value__'], - item['__line__'], - filename, - ), - ) + if not item['__line__'] in ignored_lines: + potential_secrets.update( + self.analyze_string( + item['__value__'], + item['__line__'], + filename, + ), + ) continue for key in item: @@ -250,250 +251,3 @@ def __dict__(self): }) return output - - -class IniFileParser(object): - - def __init__(self, file): - self.parser = configparser.ConfigParser() - self.parser.optionxform = str - self.parser.read_file(file) - - # Hacky way to keep track of line location - file.seek(0) - self.lines = list(map(lambda x: x.strip(), file.readlines())) - self.line_offset = 0 - - def iterator(self): - if not self.parser.sections(): - # To prevent cases where it's not an ini file, but the parser - # helpfully attempts to parse everything to a DEFAULT section, - # when not explicitly provided. - raise configparser.Error - - for section_name, _ in self.parser.items(): - for key, values in self.parser.items(section_name): - for value, offset in self._get_value_and_line_offset( - key, - values, - ): - yield value, offset - - def _get_value_and_line_offset(self, key, values): - """Returns the index of the location of key, value pair in lines. - - :type key: str - :param key: key, in config file. - - :type values: str - :param values: values for key, in config file. This is plural, - because you can have multiple values per key. Eg. - - >>> key = - ... value1 - ... value2 - - :type lines: list - :param lines: a collection of lines-so-far in file - - :rtype: list(tuple) - """ - values_list = self._construct_values_list(values) - if not values_list: - return [] - - current_value_list_index = 0 - output = [] - lines_modified = False - - first_line_regex = re.compile(r'^\s*{}[ :=]+{}'.format( - re.escape(key), - re.escape(values_list[current_value_list_index]), - )) - comment_regex = re.compile(r'\s*[;#]') - for index, line in enumerate(self.lines): - if current_value_list_index == 0: - if first_line_regex.match(line): - output.append(( - values_list[current_value_list_index], - self.line_offset + index + 1, - )) - - current_value_list_index += 1 - - continue - - # Check ignored lines before checking values, because - # you can write comments *after* the value. - - # Ignore blank lines - if not line.strip(): - continue - - # Ignore comments - if comment_regex.match(line): - continue - - if current_value_list_index == len(values_list): - if index == 0: - index = 1 # don't want to count the same line again - - self.line_offset += index - self.lines = self.lines[index:] - lines_modified = True - - break - else: - output.append(( - values_list[current_value_list_index], - self.line_offset + index + 1, - )) - - current_value_list_index += 1 - - if not lines_modified: - # No more lines left, if loop was not explicitly left. - self.lines = [] - - return output - - @staticmethod - def _construct_values_list(values): - """ - This values_list is a strange construction, because of ini format. - We need to extract the values with the following supported format: - - >>> key = value0 - ... value1 - ... - ... # comment line here - ... value2 - - given that normally, either value0 is supplied, or (value1, value2), - but still allowing for all three at once. - - Furthermore, with the configparser, we will get a list of values, - and intermediate blank lines, but no comments. This means that we can't - merely use the count of values' items to heuristically "skip ahead" lines, - because we still have to manually parse through this. - - Therefore, we construct the values_list in the following fashion: - 1. Keep the first value (in the example, this is `value0`) - 2. For all other values, ignore blank lines. - Then, we can parse through, and look for values only. - """ - values_list = values.splitlines() - return values_list[:1] + list( - filter( - lambda x: x, - values_list[1:], - ), - ) - - -class YamlLineInjector(object): - """ - Yaml config files are interesting, because they don't necessarily conform - to our basic regex for detecting HighEntropyStrings as strings don't - need to be quoted. - - This causes interesting issues, because our regex won't catch non-quoted - strings, and if we ignore the quoting requirement, then we increase our - false positive rate, because any long string would have high entropy. - - Therefore, we take a different approach: intercept the parsing of the yaml - file to identify string values. This assumes: - - 1. Secrets are strings - 2. Secrets are not keys - - Then, we calculate the entropy of those string values. - - The difficulty comes from determining the line number which these values - come from. To do this, we transform the string into a dictionary of - meta-tags, in the following format: - - >>> { - 'key': { - '__value__': value, - '__line__': , - } - } - - This way, we can quickly identify the line number for auditing at a later - stage. - - This parsing method is inspired by https://stackoverflow.com/a/13319530. - """ - - def __init__(self, file): - self.loader = yaml.SafeLoader(file.read()) - - self.loader.compose_node = self._compose_node_shim - - def json(self): - return self.loader.get_single_data() - - def _compose_node_shim(self, parent, index): - line = self.loader.line - - node = yaml.composer.Composer.compose_node(self.loader, parent, index) - node.__line__ = line + 1 - - if node.tag.endswith(':map'): - return self._tag_dict_values(node) - - # TODO: Not sure if need to do :seq - - return node - - def _tag_dict_values(self, map_node): - """ - :type map_node: yaml.nodes.MappingNode - :param map_node: It looks like map_node.value contains a list of - pair tuples, corresponding to key,value pairs. - """ - new_values = [] - for key, value in map_node.value: - if not value.tag.endswith(':str'): - new_values.append((key, value,)) - continue - - augmented_string = yaml.nodes.MappingNode( - tag=map_node.tag, - value=[ - self._create_key_value_pair_for_mapping_node_value( - '__value__', - value.value, - 'tag:yaml.org,2002:str', - ), - self._create_key_value_pair_for_mapping_node_value( - '__line__', - str(value.__line__), - 'tag:yaml.org,2002:int', - ), - ], - ) - - new_values.append((key, augmented_string,)) - - output = yaml.nodes.MappingNode( - tag=map_node.tag, - value=new_values, - start_mark=map_node.start_mark, - end_mark=map_node.end_mark, - flow_style=map_node.flow_style, - ) - return output - - def _create_key_value_pair_for_mapping_node_value(self, key, value, tag): - return ( - yaml.nodes.ScalarNode( - tag='tag:yaml.org,2002:str', - value=key, - ), - yaml.nodes.ScalarNode( - tag=tag, - value=value, - ), - ) diff --git a/tests/plugins/core/yaml_file_parser_test.py b/tests/plugins/core/yaml_file_parser_test.py new file mode 100644 index 000000000..f31f6dd0d --- /dev/null +++ b/tests/plugins/core/yaml_file_parser_test.py @@ -0,0 +1,20 @@ +from __future__ import absolute_import +from __future__ import unicode_literals + +from detect_secrets.plugins.core.yaml_file_parser import YamlFileParser +from testing.mocks import mock_file_object + + +class TestYamlFileParser(object): + + def test_get_ignored_lines(self): + content = """keyA: value + keyB: \"another_value\" # pragma: whitelist secret + keyC: yet_another_value + """ + + f = mock_file_object(content) + + ignored_lines = YamlFileParser(f).get_ignored_lines() + + assert ignored_lines == {2} From 8842526658110e2400134f4664f5067f15eb7ac7 Mon Sep 17 00:00:00 2001 From: Louis Trezzini Date: Tue, 10 Jul 2018 14:27:59 +0100 Subject: [PATCH 4/4] Review fixes --- detect_secrets/plugins/core/yaml_file_parser.py | 11 +++++++++++ detect_secrets/plugins/high_entropy_strings.py | 17 +++++++++-------- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/detect_secrets/plugins/core/yaml_file_parser.py b/detect_secrets/plugins/core/yaml_file_parser.py index 4ea38f361..bec927d78 100644 --- a/detect_secrets/plugins/core/yaml_file_parser.py +++ b/detect_secrets/plugins/core/yaml_file_parser.py @@ -113,6 +113,17 @@ def _create_key_value_pair_for_mapping_node_value(key, value, tag): ) def get_ignored_lines(self): + """ + Return a set of integers that refer to line numbers that were + whitelisted by the user and should be ignored. + + We need to parse the file separately from PyYAML parsing because + the parser drops the comments (at least up to version 3.13): + https://github.com/yaml/pyyaml/blob/a2d481b8dbd2b352cb001f07091ccf669227290f/lib3/yaml/scanner.py#L749 + + :return: set + """ + ignored_lines = set() for line_number, line in enumerate(self.content.split('\n'), 1): diff --git a/detect_secrets/plugins/high_entropy_strings.py b/detect_secrets/plugins/high_entropy_strings.py index 3554a6940..4f4d1ec32 100644 --- a/detect_secrets/plugins/high_entropy_strings.py +++ b/detect_secrets/plugins/high_entropy_strings.py @@ -160,15 +160,16 @@ def _analyze_yaml_file(self, file, filename): item = to_search.pop() try: + if '__line__' in item and not item['__line__'] in ignored_lines: + potential_secrets.update( + self.analyze_string( + item['__value__'], + item['__line__'], + filename, + ), + ) + if '__line__' in item: - if not item['__line__'] in ignored_lines: - potential_secrets.update( - self.analyze_string( - item['__value__'], - item['__line__'], - filename, - ), - ) continue for key in item: