In [2]:
import os
import json
import tempfile

from datasets import load_dataset
from detect_secrets import SecretsCollection
from detect_secrets.settings import default_settings, transient_settings

## Run detect-secrets with default settings on 1k samples
We will run detect-secrets with default settings (with all plugins and filters) on the 1K subset of the stack (the original labelling dataset), to see which detector berform well and which don't. Then we will select a subset of plugins and filters

In [4]:
ds = load_dataset("loubnabnl/pii_labeling_dataset", split="train")
ds

Using custom data configuration loubnabnl--pii_labeling_dataset-e7718515568813a1
Found cached dataset parquet (/Users/loubnabenallal/.cache/huggingface/datasets/loubnabnl___parquet/loubnabnl--pii_labeling_dataset-e7718515568813a1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


Dataset({
    features: ['content', 'licenses', 'repository_name', 'path', 'size', 'lang'],
    num_rows: 1000
})

In [44]:
def scan_str_content(content, suffix=".txt"):
    """Detect secret keys in content
    Args:
        content (str): content to scan
        suffix (str): suffix of the file
    Returns:
        list: list of secrets found"""
    
    fp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False, mode='w')
    fp.write(content)
    fp.close()
    secrets = SecretsCollection()
    #with transient_settings({'plugins_used': plugins, 'filters_used': filters}) as settings:
    with default_settings() as settings:
        secrets.scan_file(fp.name)
    os.unlink(fp.name)
    secrets_set = list(secrets.data.values())
    result = []
    if secrets_set:
        for secret in secrets_set[0]:
            result.append({
                'type': secret.type,
                'secret_value': secret.secret_value,
                'start_index': content.index(secret.secret_value),
                'end_index': content.index(secret.secret_value) + len(secret.secret_value),
            })
    return result


def scan_secrets_batch(examples):
    """Scan a batch of examples from a dataset for secret keys
    This add two columns to the dataset:
    - pii: (list) of secrets found
    - has_pii: (bool) whether the example contains secret"""

    list_secrets = []
    list_types = []
    list_limits = []
    has_secrets = []
    for text in examples["content"]:
        output = scan_str_content(text, suffix=".txt")
        if  output:
            # get secret values of each element in output
            # to add this in datasets we need same number of samples in each row
            # we save it as str instead of list
            secrets = str([e['secret_value'] for e in output])
            types = str([e['type'] for e in output])
            limits = str([(e['start_index'], e['end_index']) for e in output])
            list_secrets.append(secrets)
            list_types.append(types)
            list_limits.append(limits)
            has_secrets.append(True)
        else:
            list_secrets.append("")
            list_types.append("")
            list_limits.append("")
            has_secrets.append(False)
    return {"secrets": list_secrets, "types": list_types, "has_secrets": has_secrets}


def scan_secrets_batch_viz(examples):
    outputs = []
    for i, text in enumerate(examples["content"]):
        output = scan_str_content(text, suffix=".txt")
        if  output:
            outputs.append({"id": i, "secrets": output})
    return outputs

In [21]:
# example
content = '''[default]
aws_access_key_id=AKIAIOSFODNN7EXAMPLE
aws_secret_access_key=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
aws_session_token=AQoEXAMPLEH4aoAH0gNCAPyJxz4BlCFFxWNE1OPTgk5TthT+FvwqnKwRcOIfrRh3c/
sso_account_id = 123456789012
IP http://10.0.0.0:24
'''
scan_str_content(content, '.txt')

[{'type': 'AWS Access Key',
  'secret_value': 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY',
  'start_index': 71,
  'end_index': 111},
 {'type': 'Base64 High Entropy String',
  'secret_value': 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY',
  'start_index': 71,
  'end_index': 111},
 {'type': 'AWS Access Key',
  'secret_value': 'AKIAIOSFODNN7EXAMPLE',
  'start_index': 28,
  'end_index': 48},
 {'type': 'Base64 High Entropy String',
  'secret_value': 'AQoEXAMPLEH4aoAH0gNCAPyJxz4BlCFFxWNE1OPTgk5TthT+FvwqnKwRcOIfrRh3c/',
  'start_index': 130,
  'end_index': 196},
 {'type': 'Secret Keyword',
  'secret_value': 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY',
  'start_index': 71,
  'end_index': 111}]

In [None]:
# use multiprocessing to scan the dataset, takes few seconds
ds_detect_secrets = ds.map(
    scan_secrets_batch,
    batched=True,
    batch_size=10,
    num_proc=12,
    load_from_cache_file=False
)

In [22]:
ds_detect_secrets

Dataset({
    features: ['content', 'licenses', 'repository_name', 'path', 'size', 'lang', 'secrets', 'types', 'has_secrets'],
    num_rows: 1000
})

In [24]:
# filter on has_secrets true
print("number of samples with secrets: ", len(ds_detect_secrets.filter(lambda x: x['has_secrets'])))
# the secrets are much more

Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/loubnabnl___parquet/loubnabnl--pii_labeling_dataset-e7718515568813a1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-23b64a24a2c7f4b7.arrow


number of samples with secrets:  28


In [25]:
print(ds_detect_secrets["secrets"])

['', '', '', '', '', '', '', '', '', '', '', '', "['75e5849b1a27d71e74de1390a4fc81c38b4ed8ce24d4efb2c9a5807d0e82106c']", '', '', '', '', '', '', '', '', '', '', '', '', "['bf2c7ce40b04ae811af714deb512510cc2c17b9ab9d6ddcf49fe4487eea7af3d', '56c932549852cddcfafdab3820b0200c7742675be92179e59e6215b340e26467']", '', '', '', '', '', '', '', '', "['7ab18906739e4662ac01e69f5ebb7352', 'test']", '', '', '', '', '', '', '', "['vmwin10', 'admin']", "['xx']", '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', "['secret', 'test-secret-key']", '', '', '', '', '', '', '', '', '', "['000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f']", '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', "['PASSWORD']", '', '', '', '', '', '', '', '', '', '', '', '', '', 

In [45]:
result = scan_secrets_batch_viz(ds)

In [46]:
result

[{'id': 12,
  'secrets': [{'type': 'Hex High Entropy String',
    'secret_value': '75e5849b1a27d71e74de1390a4fc81c38b4ed8ce24d4efb2c9a5807d0e82106c',
    'start_index': 2449,
    'end_index': 2513}]},
 {'id': 25,
  'secrets': [{'type': 'Hex High Entropy String',
    'secret_value': '56c932549852cddcfafdab3820b0200c7742675be92179e59e6215b340e26467',
    'start_index': 545,
    'end_index': 609},
   {'type': 'Hex High Entropy String',
    'secret_value': 'bf2c7ce40b04ae811af714deb512510cc2c17b9ab9d6ddcf49fe4487eea7af3d',
    'start_index': 194,
    'end_index': 258}]},
 {'id': 34,
  'secrets': [{'type': 'Secret Keyword',
    'secret_value': 'test',
    'start_index': 996,
    'end_index': 1000},
   {'type': 'Hex High Entropy String',
    'secret_value': '7ab18906739e4662ac01e69f5ebb7352',
    'start_index': 1168,
    'end_index': 1200}]},
 {'id': 42,
  'secrets': [{'type': 'Secret Keyword',
    'secret_value': 'admin',
    'start_index': 175,
    'end_index': 180},
   {'type': 'Secret Ke

In [37]:
pip install gibberish-detector

Collecting gibberish-detector
  Downloading gibberish_detector-0.1.1-py3-none-any.whl (10 kB)
Installing collected packages: gibberish-detector
Successfully installed gibberish-detector-0.1.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [41]:
from gibberish_detector import detector

# pip install gibberish-detector
# download the training corpora from https://raw.githubusercontent.com/domanchi/gibberish-detector/master/examples/big.txt
# run gibberish-detector train big.txt > big.model to generate the model (it takes 3 seconds)
Detector = detector.create_from_model('big.model')
print(Detector.is_gibberish('//password'))

False


### Analysis of the results
Looking at the samples it seems like most of them are detected with:
* Hex High Entropy String, some other regex detectors like JSON Web Token, and these work well.
* Many are also detected with Base64 High Entropy String, but this is has some false positives like paths from one file. We will see that the **Gibberish-detector** can filter them out. 
* Then many are detected with the Secret Keyword Detector and most of them aren't secrets, this is expected according to docs since it variable names that are often associated with secrets. 
* There is also one detection with Basic Auth Credentials, but this is a false positive (same was observed in codeparrot data scanning)

=> We will keep Hex High Entropy String, Base64 High Entropy String with Gibberish detector on top, remove Secret Keyword Detector and Basic Auth Credentials. And keep the other regex based detectors, we need to test how much they detect on a larger dataset, we also remove [PrivateKeyDetector](https://github.com/Yelp/detect-secrets/blob/6bf879011cea4d280daee08a89bdc1002fd4fc53/detect_secrets/plugins/private_key.py), it checks for keywords too

Note: Later we try changing  the limit parameter of the BASE64 entropy detector, but it doesn't hep

In [75]:
print(f"total number of detected secrets in {len(result)} files (among 1k) is: {sum([len(e['secrets']) for e in result])}")

total number of detected secrets in 28 files (among 1k) is: 134


In [52]:
print("non gibberish secrets")
for detection in result:
    id = detection['id']
    for d in detection["secrets"]:
        if d["type"] == "Base64 High Entropy String":
            if not Detector.is_gibberish(d["secret_value"]):
                print(f"secret:{d['secret_value']}\n id:{id} start_index:{d['start_index']}\n")
                print(detection["id"])

non gibberish secrets
secret:PHPUnit\\Framework\\MockObject\\SoapExtensionNotAvailableException
 id:723 start_index:557089

723
secret:PHPUnit\\Framework\\MockObject\\ClassIsFinalException
 id:723 start_index:550227

723
secret:PHPUnit\\Framework\\MockObject\\ConfigurableMethodsAlreadyInitializedException
 id:723 start_index:550543

723
secret:PHPUnit\\Framework\\MockObject\\MethodParametersAlreadyConfiguredException
 id:723 start_index:553209

723
secret:PHPUnit\\Framework\\MockObject\\MethodNameAlreadyConfiguredException
 id:723 start_index:552671

723
secret:PHPUnit\\Framework\\MockObject\\ClassAlreadyExistsException
 id:723 start_index:550049

723
secret:PHPUnit\\Framework\\NoChildTestSuiteException
 id:723 start_index:559526

723
secret:PHPUnit\\TextUI\\XmlConfiguration\\UpdateSchemaLocationTo93
 id:723 start_index:580915

723
secret:PHPUnit\\Framework\\InvalidCoversTargetException
 id:723 start_index:547829

723
secret:PHPUnit\\Framework\\MockObject\\Rule\\InvokedAtLeastCount
 id

In [55]:
print(ds[723]["content"][550227-400:550227+400])

otUseAddMethodsException.php',
        'PHPUnit\\Framework\\MockObject\\CannotUseOnlyMethodsException' => __DIR__ . '/..' . '/phpunit/phpunit/src/Framework/MockObject/Exception/CannotUseOnlyMethodsException.php',
        'PHPUnit\\Framework\\MockObject\\ClassAlreadyExistsException' => __DIR__ . '/..' . '/phpunit/phpunit/src/Framework/MockObject/Exception/ClassAlreadyExistsException.php',
        'PHPUnit\\Framework\\MockObject\\ClassIsFinalException' => __DIR__ . '/..' . '/phpunit/phpunit/src/Framework/MockObject/Exception/ClassIsFinalException.php',
        'PHPUnit\\Framework\\MockObject\\ConfigurableMethod' => __DIR__ . '/..' . '/phpunit/phpunit/src/Framework/MockObject/ConfigurableMethod.php',
        'PHPUnit\\Framework\\MockObject\\ConfigurableMethodsAlreadyInitializedException' => _


Conclusion: Gibberish detector can filter the false positives of this detector

In [57]:
print("gibberish secrets")
for detection in result:
    id = detection['id']
    for d in detection["secrets"]:
        if d["type"] == "Base64 High Entropy String":
            if Detector.is_gibberish(d["secret_value"]):
                print(f"secret:{d['secret_value']}\n id:{id} start_index:{d['start_index']}\n")
                print(detection["id"])

gibberish secrets
secret:ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=
 id:416 start_index:736691

416
secret:sha256-FgpCb/KJQlLNfOu91ta32o/NMZxltwRo8QtmkMRdAu8=
 id:710 start_index:4091

710
secret:sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM
 id:710 start_index:4509

710
secret:sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T
 id:710 start_index:359

710
secret:sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1
 id:710 start_index:4289

710
secret:sha384-fnmOCqbTlWIlj8LyTjo7mOUStjsKC4pOpQbqyi7RrhN7udi9RwhKkMHpvLbHG9Sr
 id:710 start_index:566

710
secret:sha384-IQsoLXl5PILFhosVNubq5LC7Qb9DXgDA9i+tQ8Zj3iwWAwPtgFTxbJ8NT4GN1R8p
 id:739 start_index:8678

739
secret:sha384-MrcW6ZMFYlzcLA8Nl+NtUVF0sA7MsXsP1UyJoMp4YLEuNSfAP+JcXn/tWtIaxVXM
 id:739 start_index:8460

739
secret:sha384-AYmEC3Yw5cVb3ZcuHtOA93w35dYTsvhLPVnYs9eStHfGJvOvKxVfELGroGkvsg+p
 id:739 start_index:418

739
secret:sha384-gXt9imSW0VcJVH

In [62]:
print(ds[710]["content"][359-300:359+400])

"UTF-8">
    <meta content="width=device-width, initial-scale=1, maximum-scale=1, shrink-to-fit=no" name="viewport">
    <title>Login</title>

    <!-- General CSS Files -->
    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css"
        integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
    <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.7.2/css/all.css"
        integrity="sha384-fnmOCqbTlWIlj8LyTjo7mOUStjsKC4pOpQbqyi7RrhN7udi9RwhKkMHpvLbHG9Sr" crossorigin="anonymous">

    <!-- CSS Libraries -->
    <link rel="stylesheet" href="../node_modules/bootstrap-social/b


## Detection with selected plugins and filters

In [70]:
secrets = SecretsCollection()
filters = [
    # some filters from [original list](https://github.com/Yelp/detect-secrets/blob/master/docs/filters.md#built-in-filters) 
    # were removed based on their goal
    {'path': 'detect_secrets.filters.heuristic.is_sequential_string'},
    {'path': 'detect_secrets.filters.heuristic.is_potential_uuid'},
    {'path': 'detect_secrets.filters.heuristic.is_likely_id_string'},
    {'path': 'detect_secrets.filters.heuristic.is_templated_secret'},
    {'path': 'detect_secrets.filters.heuristic.is_sequential_string'},
]
plugins = [
        # remove 3 plugins for keyword
        {'name': 'ArtifactoryDetector'},
        {'name': 'AWSKeyDetector'},
        # the entropy detectors esp Base64 need the gibberish detector on top
        {'name': 'Base64HighEntropyString'},
        {'name': 'HexHighEntropyString'},
        {'name': 'AzureStorageKeyDetector'},
        {'name': 'CloudantDetector'},
        {'name': 'DiscordBotTokenDetector'},
        {'name': 'GitHubTokenDetector'},
        {'name': 'IbmCloudIamDetector'},
        {'name': 'IbmCosHmacDetector'},
        {'name': 'JwtTokenDetector'},
        {'name': 'MailchimpDetector'},
        {'name': 'NpmDetector'},
        {'name': 'SendGridDetector'},
        {'name': 'SlackDetector'},
        {'name': 'SoftlayerDetector'},
        {'name': 'StripeDetector'},
        {'name': 'TwilioKeyDetector'},
        #{'name': 'BasicAuthDetector'},
        #{'name': 'KeywordDetector'},
        #{'name': 'PrivateKeyDetector'},
    ]

def scan_str_content(content, suffix=".txt"):
    """Detect secret keys in content with selected plugins and filters
    Args:
        content (str): content to scan
        suffix (str): suffix of the file
    Returns:
        list: list of secrets found"""
    
    fp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False, mode='w')
    fp.write(content)
    fp.close()
    secrets = SecretsCollection()
    with transient_settings({'plugins_used': plugins, 'filters_used': filters}) as settings:
    #with default_settings() as settings:
        secrets.scan_file(fp.name)
    os.unlink(fp.name)
    secrets_set = list(secrets.data.values())
    result = []
    if secrets_set:
        for secret in secrets_set[0]:
            result.append({
                'type': secret.type,
                'secret_value': secret.secret_value,
                'start_index': content.index(secret.secret_value),
                'end_index': content.index(secret.secret_value) + len(secret.secret_value),
            })
    return result


def scan_secrets_batch(examples):
    """Scan a batch of examples from a dataset for secret keys
    This add two columns to the dataset:
    - pii: (list) of secrets found
    - has_pii: (bool) whether the example contains secret"""

    list_secrets = []
    list_types = []
    list_limits = []
    has_secrets = []
    for text in examples["content"]:
        output = scan_str_content(text, suffix=".txt")
        if  output:
            # get secret values of each element in output
            # to add this in datasets we need same number of samples in each row
            # we save it as str instead of list
            secrets = str([e['secret_value'] for e in output])
            types = str([e['type'] for e in output])
            limits = str([(e['start_index'], e['end_index']) for e in output])
            list_secrets.append(secrets)
            list_types.append(types)
            list_limits.append(limits)
            has_secrets.append(True)
        else:
            list_secrets.append("")
            list_types.append("")
            list_limits.append("")
            has_secrets.append(False)
    return {"secrets": list_secrets, "types": list_types, "has_secrets": has_secrets}


def scan_secrets_batch_viz(examples):
    outputs = []
    for i, text in enumerate(examples["content"]):
        output = scan_str_content(text, suffix=".txt")
        if  output:
            outputs.append({"id": i, "secrets": output})
    return outputs

In [71]:
# use multiprocessing to scan the dataset, takes few seconds
result_custom = scan_secrets_batch_viz(ds)

In [74]:
print(f"total number of detected secrets in {len(result_custom)} files (among 1k) is: {sum([len(e['secrets']) for e in result_custom])}")

total number of detected secrets in 20 files (among 1k) is: 124


this is 8 file sless than before and 10 secrets less

In [72]:
result_custom

[{'id': 12,
  'secrets': [{'type': 'Hex High Entropy String',
    'secret_value': '3800a9169891c0554775b12cbf5d79f6eb50ccb5f95630536a4cecd7a18aa34b',
    'start_index': 2316,
    'end_index': 2380},
   {'type': 'Hex High Entropy String',
    'secret_value': '75e5849b1a27d71e74de1390a4fc81c38b4ed8ce24d4efb2c9a5807d0e82106c',
    'start_index': 2449,
    'end_index': 2513},
   {'type': 'Hex High Entropy String',
    'secret_value': 'be6bbdf83a789fd2b7e5ac8e2954f510e92115bb9e1c84591f6adb4055a3b845',
    'start_index': 2080,
    'end_index': 2144}]},
 {'id': 25,
  'secrets': [{'type': 'Hex High Entropy String',
    'secret_value': 'bf2c7ce40b04ae811af714deb512510cc2c17b9ab9d6ddcf49fe4487eea7af3d',
    'start_index': 194,
    'end_index': 258},
   {'type': 'Hex High Entropy String',
    'secret_value': '56c932549852cddcfafdab3820b0200c7742675be92179e59e6215b340e26467',
    'start_index': 545,
    'end_index': 609}]},
 {'id': 34,
  'secrets': [{'type': 'Hex High Entropy String',
    'secret_

In [115]:
print(ds[25]["content"][0:194+400])

{
	'repo_type' : 'archive',
	'download_locations' : [
		#UPDATECHECKS: http://fftw.org/download.html
		#{ "url" : "http://fftw.org/fftw-3.3.9.tar.gz", "hashes" : [ { "type" : "sha256", "sum" : "bf2c7ce40b04ae811af714deb512510cc2c17b9ab9d6ddcf49fe4487eea7af3d" }, ], },
		#{ "url" : "https://fossies.org/linux/misc/fftw-3.3.9.tar.gz", "hashes" : [ { "type" : "sha256", "sum" : "bf2c7ce40b04ae811af714deb512510cc2c17b9ab9d6ddcf49fe4487eea7af3d" }, ], },
		{ "url" : "http://fftw.org/fftw-3.3.10.tar.gz", "hashes" : [ { "type" : "sha256", "sum" : "56c932549852cddcfafdab3820b0200c7742675be92179e59


In [94]:
print("non gibberish secrets")
count, gibberish = 0, 0
for detection in result_custom:
    id = detection['id']
    for d in detection["secrets"]:
        if d["type"] == "Base64 High Entropy String":
            if not Detector.is_gibberish(d["secret_value"]):
                count += 1
            else: gibberish += 1
print(f"Number non gibberish secrets: {count}")
print(f"Number gibberish secrets: {gibberish}")

non gibberish secrets
Number non gibberish secrets: 26
Number gibberish secrets: 13


### Impact of changing the value of limit on Base64 entropy detector

In [109]:
secrets = SecretsCollection()
LIMIT = 5
filters = [
    # some filters from [original list](https://github.com/Yelp/detect-secrets/blob/master/docs/filters.md#built-in-filters) 
    # were removed based on their goal
    {'path': 'detect_secrets.filters.heuristic.is_sequential_string'},
    {'path': 'detect_secrets.filters.heuristic.is_potential_uuid'},
    {'path': 'detect_secrets.filters.heuristic.is_likely_id_string'},
    {'path': 'detect_secrets.filters.heuristic.is_templated_secret'},
    {'path': 'detect_secrets.filters.heuristic.is_sequential_string'},
]
plugins = [
        {'name': 'Base64HighEntropyString',
         'limit': LIMIT},
    ]

def scan_str_content(content, suffix=".txt"):
    """Detect secret keys in content with selected plugins and filters
    Args:
        content (str): content to scan
        suffix (str): suffix of the file
    Returns:
        list: list of secrets found"""
    
    fp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False, mode='w')
    fp.write(content)
    fp.close()
    secrets = SecretsCollection()
    with transient_settings({'plugins_used': plugins, 'filters_used': filters}) as settings:
    #with default_settings() as settings:
        secrets.scan_file(fp.name)
    os.unlink(fp.name)
    secrets_set = list(secrets.data.values())
    result = []
    if secrets_set:
        for secret in secrets_set[0]:
            result.append({
                'type': secret.type,
                'secret_value': secret.secret_value,
                'start_index': content.index(secret.secret_value),
                'end_index': content.index(secret.secret_value) + len(secret.secret_value),
            })
    return result


def scan_secrets_batch(examples):
    """Scan a batch of examples from a dataset for secret keys
    This add two columns to the dataset:
    - pii: (list) of secrets found
    - has_pii: (bool) whether the example contains secret"""

    list_secrets = []
    list_types = []
    list_limits = []
    has_secrets = []
    for text in examples["content"]:
        output = scan_str_content(text, suffix=".txt")
        if  output:
            # get secret values of each element in output
            # to add this in datasets we need same number of samples in each row
            # we save it as str instead of list
            secrets = str([e['secret_value'] for e in output])
            types = str([e['type'] for e in output])
            limits = str([(e['start_index'], e['end_index']) for e in output])
            list_secrets.append(secrets)
            list_types.append(types)
            list_limits.append(limits)
            has_secrets.append(True)
        else:
            list_secrets.append("")
            list_types.append("")
            list_limits.append("")
            has_secrets.append(False)
    return {"secrets": list_secrets, "types": list_types, "has_secrets": has_secrets}


def scan_secrets_batch_viz(examples):
    outputs = []
    for i, text in enumerate(examples["content"]):
        output = scan_str_content(text, suffix=".txt")
        if  output:
            outputs.append({"id": i, "secrets": output})
    return outputs

In [110]:
# use multiprocessing to scan the dataset, takes few seconds
result_custom2 = scan_secrets_batch_viz(ds)

In [111]:
ount, gibberish = 0, 0
for detection in result_custom2:
    id = detection['id']
    for d in detection["secrets"]:
        if d["type"] == "Base64 High Entropy String":
            if not Detector.is_gibberish(d["secret_value"]):
                count += 1
            else: gibberish += 1
print(f"Number non gibberish secrets: {count}")
print(f"Number gibberish secrets: {gibberish}")

Number non gibberish secrets: 2505
Number gibberish secrets: 42


Results of detection for different limits:

limit=5.5
Number non gibberish secrets: 26
Number gibberish secrets: 2

limit=5 (default)
Number non gibberish secrets: 26
Number gibberish secrets: 12

limit=4.5
Number non gibberish secrets: 52
Number gibberish secrets: 13

limit=4
Number non gibberish secrets: 2505
Number gibberish secrets: 42

=> changing limit doesn't help, let's just use the gibberish detector on top then