From e5e0b3cde56434a708e7b0160f4dc0e2e77d4aac Mon Sep 17 00:00:00 2001 From: Victor Zhou Date: Thu, 22 Aug 2019 16:08:01 -0700 Subject: [PATCH] Detect YAML binary secrets This implements support for high-entropy secrets in binary values in yaml files. We encode the binary value into a hex- or base64-encoded string (based on the plugin), and run the normal entropy check. If the string is deemed to be high-entropy, we re encode the string into a yaml binary (using `yaml.dump`) and strip the `!!binary`. This yaml binary is considered the secret, and is put into the baseline as normal. I had to update a test function so that it uses a custom hex high-entropy detector, since `HighEntropyStringsPlugin` is now an abstract class. --- detect_secrets/core/potential_secret.py | 15 ++-- .../plugins/common/yaml_file_parser.py | 2 +- .../plugins/high_entropy_strings.py | 68 +++++++++++++++++-- test_data/config.yaml | 2 + test_data/config2.yaml | 2 + tests/plugins/common/yaml_file_parser_test.py | 2 +- tests/plugins/high_entropy_strings_test.py | 37 ++++++++-- 7 files changed, 108 insertions(+), 20 deletions(-) create mode 100644 test_data/config2.yaml diff --git a/detect_secrets/core/potential_secret.py b/detect_secrets/core/potential_secret.py index d25637418..e7e8253f5 100644 --- a/detect_secrets/core/potential_secret.py +++ b/detect_secrets/core/potential_secret.py @@ -47,10 +47,18 @@ def __init__( self.type = typ self.filename = filename self.lineno = lineno - self.secret_hash = self.hash_secret(secret) + self.set_secret(secret) self.is_secret = is_secret self.is_verified = False + # If two PotentialSecrets have the same values for these fields, + # they are considered equal. Note that line numbers aren't included + # in this, because line numbers are subject to change. + self.fields_to_compare = ['filename', 'secret_hash', 'type'] + + def set_secret(self, secret): + self.secret_hash = self.hash_secret(secret) + # NOTE: Originally, we never wanted to keep the secret value in memory, # after finding it in the codebase. However, to support verifiable # secrets (and avoid the pain of re-scanning again), we need to @@ -61,11 +69,6 @@ def __init__( # in the repository. self.secret_value = secret - # If two PotentialSecrets have the same values for these fields, - # they are considered equal. Note that line numbers aren't included - # in this, because line numbers are subject to change. - self.fields_to_compare = ['filename', 'secret_hash', 'type'] - @staticmethod def hash_secret(secret): """This offers a way to coherently test this class, diff --git a/detect_secrets/plugins/common/yaml_file_parser.py b/detect_secrets/plugins/common/yaml_file_parser.py index 9453bdd5b..69339afb3 100644 --- a/detect_secrets/plugins/common/yaml_file_parser.py +++ b/detect_secrets/plugins/common/yaml_file_parser.py @@ -88,7 +88,7 @@ def _tag_dict_values(self, map_node): self._create_key_value_pair_for_mapping_node_value( key='__value__', value=value.value, - tag='tag:yaml.org,2002:str', + tag=value.tag, ), self._create_key_value_pair_for_mapping_node_value( key='__line__', diff --git a/detect_secrets/plugins/high_entropy_strings.py b/detect_secrets/plugins/high_entropy_strings.py index e5d692906..25f65d621 100644 --- a/detect_secrets/plugins/high_entropy_strings.py +++ b/detect_secrets/plugins/high_entropy_strings.py @@ -4,11 +4,13 @@ from backports import configparser except ImportError: # pragma: no cover import configparser +import base64 import math import os import re import string from abc import ABCMeta +from abc import abstractmethod from contextlib import contextmanager import yaml @@ -206,14 +208,23 @@ def _analyze_yaml_file(self, file, filename): try: if '__line__' in item and item['__line__'] not in ignored_lines: - potential_secrets.update( - self.analyze_string( - item['__value__'], - item['__line__'], - filename, - ), + # An isinstance check doesn't work in py2 + # so we need the __is_binary__ field. + string_to_scan = self.decode_binary(item['__value__']) \ + if item['__is_binary__'] \ + else item['__value__'] + + secrets = self.analyze_string( + string_to_scan, + item['__line__'], + filename, ) + if item['__is_binary__']: + secrets = self._encode_yaml_binary_secrets(secrets) + + potential_secrets.update(secrets) + if '__line__' in item: continue @@ -226,6 +237,39 @@ def _analyze_yaml_file(self, file, filename): return potential_secrets + def _encode_yaml_binary_secrets(self, secrets): + result = {} + """The secrets dict format is + `{PotentialSecret: PotentialSecret}`, where both key and + value are the same object. Therefore, we can just mutate + the potential secret once. + """ + for potential_secret in secrets.keys(): + secret_in_yaml_format = yaml.dump( + self.encode_to_binary(potential_secret.secret_value), + ).replace( + '!!binary ', + '', + ) + + potential_secret.set_secret(secret_in_yaml_format) + + result[potential_secret] = potential_secret + + return result + + @abstractmethod + def decode_binary(self, bytes_object): # pragma: no cover + """Converts the bytes to a string which can be checked for + high entropy.""" + pass + + @abstractmethod + def encode_to_binary(self, string): # pragma: no cover + """Converts a string (usually a high-entropy secret) to + binary. Usually the inverse of decode_binary.""" + pass + class HexHighEntropyString(HighEntropyStringsPlugin): """HighEntropyStringsPlugin for hex encoded strings""" @@ -278,6 +322,12 @@ def calculate_shannon_entropy(self, data): return entropy + def decode_binary(self, bytes_object): + return bytes_object.decode('utf-8') + + def encode_to_binary(self, string): + return string.encode('utf-8') + class Base64HighEntropyString(HighEntropyStringsPlugin): """HighEntropyStringsPlugin for base64 encoded strings""" @@ -299,3 +349,9 @@ def __dict__(self): }) return output + + def decode_binary(self, bytes_object): + return base64.b64encode(bytes_object).decode('utf-8') + + def encode_to_binary(self, string): + return base64.b64decode(string) diff --git a/test_data/config.yaml b/test_data/config.yaml index 26ab72ce3..7881662fb 100644 --- a/test_data/config.yaml +++ b/test_data/config.yaml @@ -11,3 +11,5 @@ list_of_keys: - 234567890a test_agent::allowlisted_api_key: 'ToCynx5Se4e2PtoZxEhW7lUJcOX15c54' # pragma: allowlist secret + +high_entropy_binary_secret: !!binary MjNjcnh1IDJieXJpdXYyeXJpaTJidnl1MnI4OXkyb3UwMg== diff --git a/test_data/config2.yaml b/test_data/config2.yaml new file mode 100644 index 000000000..00d97250a --- /dev/null +++ b/test_data/config2.yaml @@ -0,0 +1,2 @@ +# This is yaml.dump('2b00042f7481c7b056c4b410d28f33cf'.encode('utf-8')) +high_entropy_hex_binary_secret: !!binary MmIwMDA0MmY3NDgxYzdiMDU2YzRiNDEwZDI4ZjMzY2Y= diff --git a/tests/plugins/common/yaml_file_parser_test.py b/tests/plugins/common/yaml_file_parser_test.py index 0258bea45..b9ff9512d 100644 --- a/tests/plugins/common/yaml_file_parser_test.py +++ b/tests/plugins/common/yaml_file_parser_test.py @@ -27,7 +27,7 @@ def test_get_ignored_lines(self): ['yaml_value', 'expected_value', 'expected_is_binary'], [ ('string_value', 'string_value', False), - ('!!binary YWJjZGVm', 'YWJjZGVm', True), + ('!!binary YWJjZGVm', b'abcdef', True), ], ) def test_possible_secret_format( diff --git a/tests/plugins/high_entropy_strings_test.py b/tests/plugins/high_entropy_strings_test.py index 9c9b72c7e..97dffc4ea 100644 --- a/tests/plugins/high_entropy_strings_test.py +++ b/tests/plugins/high_entropy_strings_test.py @@ -2,13 +2,11 @@ from __future__ import unicode_literals import codecs -import string import pytest from detect_secrets.plugins.high_entropy_strings import Base64HighEntropyString from detect_secrets.plugins.high_entropy_strings import HexHighEntropyString -from detect_secrets.plugins.high_entropy_strings import HighEntropyStringsPlugin from testing.mocks import mock_file_object @@ -213,12 +211,13 @@ def test_yaml_file(self): with open('test_data/config.yaml') as f: secrets = plugin.analyze(f, 'test_data/config.yaml') - assert len(secrets.values()) == 2 + assert len(secrets.values()) == 3 for secret in secrets.values(): location = str(secret).splitlines()[1] assert location in ( 'Location: test_data/config.yaml:3', 'Location: test_data/config.yaml:6', + 'Location: test_data/config.yaml:15', ) def test_env_file(self): @@ -234,6 +233,21 @@ def test_env_file(self): ) +class HexHighEntropyStringsWithStandardEntropy(HexHighEntropyString): + """Copies the HexHighEntropyString class, but keeps the standard + Shannon entropy calculation. + """ + + def __init__(self, *args, **kwargs): + super(HexHighEntropyStringsWithStandardEntropy, self).__init__( + *args, + **kwargs + ) + + def calculate_shannon_entropy(self, data): + return super(HexHighEntropyString, self).calculate_shannon_entropy(data) + + class TestHexHighEntropyStrings(HighEntropyStringsTest): def setup(self): @@ -247,10 +261,21 @@ def setup(self): secret_string='2b00042f7481c7b056c4b410d28f33cf', ) + def test_yaml_file(self): + plugin = HexHighEntropyString(3) + with open('test_data/config2.yaml') as f: + secrets = plugin.analyze(f, 'test_data/config2.yaml') + + assert len(secrets.values()) == 1 + for secret in secrets.values(): + location = str(secret).splitlines()[1] + assert location in ( + 'Location: test_data/config2.yaml:2', + ) + def test_discounts_when_all_numbers(self): - original_scanner = HighEntropyStringsPlugin( - charset=string.hexdigits, - limit=3, + original_scanner = HexHighEntropyStringsWithStandardEntropy( + hex_limit=3, exclude_lines_regex=None, )