PII detection with regexes and Presidio on 100 Python samples, applied to the source code and to the extracted docstrings and comments

In [2]:
from datasets import load_dataset
from bigscience_pii_detect_redact import run_pii_batch
from functools import partial

## Email and IP address detection in the source code files with regex

In [3]:
def get_check_ds(ds, args):
    """Return at most the first 100 rows of ``ds`` whose ``modified`` flag is set.

    Args:
        ds: Dataset-like object exposing ``filter``, ``select`` and ``__len__``,
            with a boolean ``modified`` column.
        args: Object providing ``batch_size`` and ``num_proc``, forwarded to
            ``ds.filter``.

    Returns:
        The filtered dataset restricted to its first ``min(len, 100)`` rows.
    """
    max_checks = 100
    # The predicate receives a batch (batched=True) and returns the `modified`
    # column itself, which serves as the per-row keep mask.
    ds_checks = ds.filter(
        lambda exs: exs["modified"],
        batched=True,
        batch_size=args.batch_size,
        num_proc=args.num_proc
    )
    # Keep the filtered dataset (it used to be overwritten by the unfiltered
    # `ds`) and clamp the selection so fewer than 100 modified rows is not an
    # out-of-range error.
    idx_samples = list(range(min(len(ds_checks), max_checks)))
    return ds_checks.select(idx_samples)

In [4]:
# Load the validation split and keep only its first 100 samples
ds = load_dataset("codeparrot/codeparrot-clean-valid", split="train").select(range(100))

Using custom data configuration codeparrot--codeparrot-clean-valid-826c6fd8b27e5523
Found cached dataset json (/Users/loubnabenallal/.cache/huggingface/datasets/codeparrot___json/codeparrot--codeparrot-clean-valid-826c6fd8b27e5523/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


In [None]:
# Run the regex-based PII detection/redaction over the samples in batches.
# No extra arguments are bound to `run_pii_batch`, so it is passed to `map`
# directly instead of through an argument-less `functools.partial`.
ds_pii = ds.map(
    run_pii_batch,
    batched=True,
    batch_size=10,
    num_proc=12,
    load_from_cache_file=False
)

In [25]:
# indexes of files where PII was detected
# Read the `modified` column once and enumerate it, instead of re-reading the
# column for every row and hard-coding the number of rows.
[idx for idx, was_modified in enumerate(ds_pii["modified"]) if was_modified]

[1, 17, 26, 29, 34, 35, 39, 45, 47, 48, 52, 57, 73, 76, 84, 88, 94, 96]

In [None]:
ds_modified = ds_pii.filter(
    lambda exs: exs["modified"],
    batched=True,
    batch_size=10,
    num_proc=12,
    load_from_cache_file=False
)

In [7]:
# 18 files were modified
ds_modified

Dataset({
    features: ['repo_name', 'path', 'copies', 'size', 'content', 'license', 'hash', 'line_mean', 'line_max', 'alpha_frac', 'autogenerated', 'regex_metadata', 'old_text', 'modified'],
    num_rows: 18
})

Some examples

In [21]:
(ds_modified["regex_metadata"][0])

'[(\'127.0.0.1\', (759, 770), \'regex.Regex(\\\'(?:^|[\\\\\\\\b\\\\\\\\s@?,!;:\\\\\\\\\\\\\\\'\\\\\\\\")(.\\\\\\\\p{Han}])((?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\\\\\\\\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}|(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\\\\\\\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\\\\\\\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]))(?:$|[\\\\\\\\s@,?!;:\\\\\\\'"(.\\\\\\\\p{Han}])\\\'

In [34]:
ds_modified["regex_metadata"][1]

'[(\'<cedric.bellegarde@adishatz.org>\', (44, 76), \'regex.Regex(\\\'([^\\\\\\\\s@,?!;:)(]+@[^,\\\\\\\\s!?;,]{3,}[\\\\\\\\.][^\\\\\\\\s\\\\\\\\b\\\\\\\\\\\\\\\'\\\\\\\\"@,?!;:)(.]+)\\\', flags=regex.M | regex.V0)\', \'EMAIL\')]'

In [35]:
ds_modified["regex_metadata"][2]

'[(\'ivana.mihalek@gmail.com\', (717, 740), \'regex.Regex(\\\'([^\\\\\\\\s@,?!;:)(]+@[^,\\\\\\\\s!?;,]{3,}[\\\\\\\\.][^\\\\\\\\s\\\\\\\\b\\\\\\\\\\\\\\\'\\\\\\\\"@,?!;:)(.]+)\\\', flags=regex.M | regex.V0)\', \'EMAIL\')]'

In [37]:
ds_modified["regex_metadata"][3]

'[(\'/build/debpkg/usr/local/lib/mxcart/my_tilers_tools/\', (1829, 1881), \'regex.Regex(\\\'((?:(?:[A-Za-z]+[\\\\\\\\p{Nd}\\\\\\\\p{Pd}\\\\\\\\/\\\\\\\\+\\\\\\\\=:_]+|[\\\\\\\\p{Nd}\\\\\\\\p{Pd}\\\\\\\\/\\\\\\\\+\\\\\\\\=:]+[A-Za-z]+)){10,})(?:$|[\\\\\\\\b\\\\\\\\s\\\\\\\\p{Han}@?,!;:\\\\\\\\\\\\\\\'\\\\\\\\")(.])\\\', flags=regex.M | regex.V0)\', \'KEY\'), (\'/build/debpkg/usr/local/share/icons/hicolor/48x48/apps/\', (2814, 2870), \'regex.Regex(\\\'((?:(?:[A-Za-z]+[\\\\\\\\p{Nd}\\\\\\\\p{Pd}\\\\\\\\/\\\\\\\\+\\\\\\\\=:_]+|[\\\\\\\\p{Nd}\\\\\\\\p{Pd}\\\\\\\\/\\\\\\\\+\\\\\\\\=:]+[A-Za-z]+)){10,})(?:$|[\\\\\\\\b\\\\\\\\s\\\\\\\\p{Han}@?,!;:\\\\\\\\\\\\\\\'\\\\\\\\")(.])\\\', flags=regex.M | regex.V0)\', \'KEY\'), (\'Icon=/usr/local/share/icons/hicolor/48x48/apps/mxcart\', (3728, 3782), \'regex.Regex(\\\'((?:(?:[A-Za-z]+[\\\\\\\\p{Nd}\\\\\\\\p{Pd}\\\\\\\\/\\\\\\\\+\\\\\\\\=:_]+|[\\\\\\\\p{Nd}\\\\\\\\p{Pd}\\\\\\\\/\\\\\\\\+\\\\\\\\=:]+[A-Za-z]+)){10,})(?:$|[\\\\\\\\b\\\\\\\\s\\\\\\\\p{Han

In [8]:
ds_modified["regex_metadata"][4]

'[(\'andrew.t.bentley@gmail.com\', (720, 746), \'regex.Regex(\\\'([^\\\\\\\\s@,?!;:\\\\\\\\\\\\\\\'=)(]+@[^,\\\\\\\\s!?;,]{3,}[\\\\\\\\.][^\\\\\\\\s\\\\\\\\b\\\\\\\\\\\\\\\'\\\\\\\\"@,?!;:)(.]+)\\\', flags=regex.M | regex.V0)\', \'EMAIL\')]'

In [13]:
print(ds_modified["old_text"][4][600:800])

sion(),
    url='http://github.com/atbentley/plank/',
    license='MIT',
    author='Andrew Bentley',
    author_email='andrew.t.bentley@gmail.com',
    description="A simple task and build runner tha


In [12]:
print(ds_modified["content"][4][600:800])

sion(),
    url='http://github.com/atbentley/plank/',
    license='MIT',
    author='Andrew Bentley',
    author_email='dummy@email.com',
    description="A simple task and build runner that doesn't g


In [15]:
print(ds_modified["regex_metadata"][5])

[('192.168.88.1', (1886, 1900), 'regex.Regex(\'(?:^|[\\\\b\\\\s@?,!;:\\\\\\\'\\\\")(.\\\\p{Han}])((?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\\\\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}|(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\\\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\\\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]))(?:$|[\\\\s@,?!;:\\\'"(.\\\\p{Han}])\', flags=regex.M | regex.V0)', 'IP_ADDRESS')]


In [23]:
print(ds_modified["old_text"][5][:])

import datetime
import subprocess

from macd.models import SeenEvent, Device
from django.shortcuts import render
from django.utils import timezone

def index(request):
    now = timezone.now()
    time_threshold = now - datetime.timedelta(minutes=10)
    items = SeenEvent.objects.filter(date__gte=time_threshold)
    devices_set = set(item.mac.device for item in items
                      if not item.mac.device.ignored)

    devices = []
    two_minutes = now - datetime.timedelta(minutes=2)
    macdb = open("/usr/share/nmap/nmap-mac-prefixes").readlines()
    for device in devices_set:
        found_2min = False
        earliest_since = None
        macs = device.mac_set.all()
        items_for_mac = SeenEvent.objects.filter(mac__in=macs)[:10000]
        if len(items_for_mac) > 0:
            for i in range(1, len(items_for_mac)):
                curr, previous = items_for_mac[i].date, items_for_mac[i-1].date
                difference = previous - curr
                if earliest_sinc

In [29]:
print(ds_modified["regex_metadata"][9])

[('9bea85795705d015cdadc82c68b99196a8554f5c', (34916, 34957), 'regex.Regex(\'((?:(?:[A-Za-z]+[\\\\p{Nd}\\\\p{Pd}\\\\/\\\\+\\\\=:_]+|[\\\\p{Nd}\\\\p{Pd}\\\\/\\\\+\\\\=:]+[A-Za-z]+)){10,})(?:$|[\\\\b\\\\s\\\\p{Han}@?,!;:\\\\\\\'\\\\")(.])\', flags=regex.M | regex.V0)', 'KEY'), ('super@example.com', (53204, 53221), 'regex.Regex(\'([^\\\\s@,?!;:\\\\\\\'=)(]+@[^,\\\\s!?;,]{3,}[\\\\.][^\\\\s\\\\b\\\\\\\'\\\\"@,?!;:)(.]+)\', flags=regex.M | regex.V0)', 'EMAIL'), ('super@example.com', (53387, 53404), 'regex.Regex(\'([^\\\\s@,?!;:\\\\\\\'=)(]+@[^,\\\\s!?;,]{3,}[\\\\.][^\\\\s\\\\b\\\\\\\'\\\\"@,?!;:)(.]+)\', flags=regex.M | regex.V0)', 'EMAIL'), ('super@example.com', (55475, 55492), 'regex.Regex(\'([^\\\\s@,?!;:\\\\\\\'=)(]+@[^,\\\\s!?;,]{3,}[\\\\.][^\\\\s\\\\b\\\\\\\'\\\\"@,?!;:)(.]+)\', flags=regex.M | regex.V0)', 'EMAIL'), ('//input[@value="Save"]\').click', (175046, 175076), 'regex.Regex(\'([^\\\\s@,?!;:\\\\\\\'=)(]+@[^,\\\\s!?;,]{3,}[\\\\.][^\\\\s\\\\b\\\\\\\'\\\\"@,?!;:)(.]+)\', flags=rege

In [36]:
#(ds_modified["old_text"][9]).index("super@")
print(ds_modified["old_text"][9][53100:])

f.super_email_login = {
            REDIRECT_FIELD_NAME: '/test_admin/admin/',
            'username': 'super@example.com',
            'password': 'secret',
        }
        self.super_email_bad_login = {
            REDIRECT_FIELD_NAME: '/test_admin/admin/',
            'username': 'super@example.com',
            'password': 'notsecret',
        }
        self.adduser_login = {
            REDIRECT_FIELD_NAME: '/test_admin/admin/',
            'username': 'adduser',
            'password': 'secret',
        }
        self.changeuser_login = {
            REDIRECT_FIELD_NAME: '/test_admin/admin/',
            'username': 'changeuser',
            'password': 'secret',
        }
        self.deleteuser_login = {
            REDIRECT_FIELD_NAME: '/test_admin/admin/',
            'username': 'deleteuser',
            'password': 'secret',
        }
        self.joepublic_login = {
            REDIRECT_FIELD_NAME: '/test_admin/admin/',
            'username': 'joepublic',
            'p

In [38]:
print(ds_modified["regex_metadata"][6])

[('"aeronaut@pianoguy.de', (319, 340), 'regex.Regex(\'([^\\\\s@,?!;:\\\\\\\'=)(]+@[^,\\\\s!?;,]{3,}[\\\\.][^\\\\s\\\\b\\\\\\\'\\\\"@,?!;:)(.]+)\', flags=regex.M | regex.V0)', 'EMAIL'), ('"vuolter@gmail.com', (386, 404), 'regex.Regex(\'([^\\\\s@,?!;:\\\\\\\'=)(]+@[^,\\\\s!?;,]{3,}[\\\\.][^\\\\s\\\\b\\\\\\\'\\\\"@,?!;:)(.]+)\', flags=regex.M | regex.V0)', 'EMAIL')]


In [39]:
print(ds_modified["regex_metadata"][7])

[('marc@marc-abramowitz.com', (249, 273), 'regex.Regex(\'([^\\\\s@,?!;:\\\\\\\'=)(]+@[^,\\\\s!?;,]{3,}[\\\\.][^\\\\s\\\\b\\\\\\\'\\\\"@,?!;:)(.]+)\', flags=regex.M | regex.V0)', 'EMAIL')]


In [40]:
print(ds_modified["regex_metadata"][8])

[('192.168.1.2', (809, 822), 'regex.Regex(\'(?:^|[\\\\b\\\\s@?,!;:\\\\\\\'\\\\")(.\\\\p{Han}])((?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\\\\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}|(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\\\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\\\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]))(?:$|[\\\\s@,?!;:\\\'"(.\\\\p{Han}])\', flags=regex.M | regex.V0)', 'IP_ADDRESS')]


In [11]:
print(ds_modified["regex_metadata"][11])

[('10.254.254.28', (453, 468), 'regex.Regex(\'(?:^|[\\\\b\\\\s@?,!;:\\\\\\\'\\\\")(.\\\\p{Han}])((?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\\\\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}|(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\\\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\\\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]))(?:$|[\\\\s@,?!;:\\\'"(.\\\\p{Han}])\', flags=regex.M | regex.V0)', 'IP_ADDRESS')]


In [16]:
print(ds_modified["old_text"][11][250:])

ort sys
from urllib.request import urlretrieve
from pathlib import Path

"""Logpuzzle exercise
Given an apache logfile, find the puzzle urls and download the images.

Here's what a puzzle url looks like:
10.254.254.28 - - [06/Aug/2007:00:13:48 -0700] "GET /~foo/puzzle-bar-aaab.jpg HTTP/1.0" 302 528 "-" "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6"
"""


def read_urls(filename):
    """Returns a list of the puzzle urls from the given log file,
    extracting the hostname from the filename itself.
    Screens out duplicate urls and returns the urls sorted into
    increasing order."""

    hostname = 'http://' + filename

    with open(filename) as file:
        log = file.read()

    images_list = re.findall('\S*/images/puzzle\S*', log)
    images = sorted(set(images_list))
    return [hostname + image for image in images]


def download_images(img_urls, dest_dir):
    """Given the urls already in the correct order, downloads
    each image

In [17]:
print(ds_modified["regex_metadata"][12])

[('zoidberg@mujmail.cz', (390, 409), 'regex.Regex(\'([^\\\\s@,?!;:\\\\\\\'\\\\"=)(]+@[^,\\\\s!?;,\\\\\\\'\\\\"=]{3,}[\\\\.][^\\\\s\\\\b\\\\\\\'\\\\"@,?!;:)(.]+)\', flags=regex.M | regex.V0)', 'EMAIL')]


In [18]:
print(ds_modified["regex_metadata"][13])

[('<wackou@gmail.com>', (140, 158), 'regex.Regex(\'([^\\\\s@,?!;:\\\\\\\'\\\\"=)(]+@[^,\\\\s!?;,\\\\\\\'\\\\"=]{3,}[\\\\.][^\\\\s\\\\b\\\\\\\'\\\\"@,?!;:)(.]+)\', flags=regex.M | regex.V0)', 'EMAIL'), ('0.0.0.0', (14127, 14136), 'regex.Regex(\'(?:^|[\\\\b\\\\s@?,!;:\\\\\\\'\\\\")(.\\\\p{Han}])((?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\\\\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}|(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\\\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:

In [21]:
(ds_modified["old_text"][13]).index("123.123.123.123")

23249

In [26]:
print(ds_modified["old_text"][13][23000:])

ironment(args.environment)
        print()
        if len(args.args) != 2:
            log.error('You need to specify a deployment config file as argument and a host ip or vps provider')
            log.error('eg: bts deploy_node deploy_config.yaml 123.123.123.123  # use given host for install')
            log.error('eg: bts deploy_node deploy_config.yaml vultr            # create a new vps instance')
            log.info('You can find an example config file at {}'.format(join(dirname(__file__), 'deploy_config.yaml')))
            sys.exit(1)

        config_file = args.args[0]
        host = args.args[1]

        from .deploy import deploy_node  # can only import now due to potential circular import

        deploy_node(args.environment, config_file, host)

    elif args.command in COMMAND_PLUGINS:
        cmd = COMMAND_PLUGINS[args.command]
        if 'env' in inspect.signature(cmd.run_command).parameters:
            cmd.run_command(*args.args, env=args.environment)
        else:
 

In [29]:
print(ds_modified["regex_metadata"][14])
print(ds_modified["regex_metadata"][15])
print(ds_modified["regex_metadata"][16])

[('ngakan.gandhi@packet-systems.com', (53, 85), 'regex.Regex(\'([^\\\\s@,?!;:\\\\\\\'\\\\"=)(]+@[^,\\\\s!?;,\\\\\\\'\\\\"=]{3,}[\\\\.][^\\\\s\\\\b\\\\\\\'\\\\"@,?!;:)(.]+)\', flags=regex.M | regex.V0)', 'EMAIL')]
[('<amix.pal@gmail.com>', (114, 134), 'regex.Regex(\'([^\\\\s@,?!;:\\\\\\\'\\\\"=)(]+@[^,\\\\s!?;,\\\\\\\'\\\\"=]{3,}[\\\\.][^\\\\s\\\\b\\\\\\\'\\\\"@,?!;:)(.]+)\', flags=regex.M | regex.V0)', 'EMAIL')]
[('<sduenas@bitergia.com>', (730, 752), 'regex.Regex(\'([^\\\\s@,?!;:\\\\\\\'\\\\"=)(]+@[^,\\\\s!?;,\\\\\\\'\\\\"=]{3,}[\\\\.][^\\\\s\\\\b\\\\\\\'\\\\"@,?!;:)(.]+)\', flags=regex.M | regex.V0)', 'EMAIL'), ('<quan@bitergia.com>', (769, 788), 'regex.Regex(\'([^\\\\s@,?!;:\\\\\\\'\\\\"=)(]+@[^,\\\\s!?;,\\\\\\\'\\\\"=]{3,}[\\\\.][^\\\\s\\\\b\\\\\\\'\\\\"@,?!;:)(.]+)\', flags=regex.M | regex.V0)', 'EMAIL'), ('P<email>[^\\s@]+@[^\\s@.]+\\.[^\\s', (2365, 2395), 'regex.Regex(\'([^\\\\s@,?!;:\\\\\\\'\\\\"=)(]+@[^,\\\\s!?;,\\\\\\\'\\\\"=]{3,}[\\\\.][^\\\\s\\\\b\\\\\\\'\\\\"@,?!;:)(.]+)\'

In [34]:
print(ds_modified["regex_metadata"][17])

[('<niphlod@gmail.com>', (49, 68), 'regex.Regex(\'([^\\\\s@,?!;:\\\\\\\'\\\\"=)(]+@[^,\\\\s!?;,\\\\\\\'\\\\"=]{3,}[\\\\.][^\\\\s\\\\b\\\\\\\'\\\\"@,?!;:)(.]+)\', flags=regex.M | regex.V0)', 'EMAIL'), ('<niphlod@gmail.com>', (983, 1002), 'regex.Regex(\'([^\\\\s@,?!;:\\\\\\\'\\\\"=)(]+@[^,\\\\s!?;,\\\\\\\'\\\\"=]{3,}[\\\\.][^\\\\s\\\\b\\\\\\\'\\\\"@,?!;:)(.]+)\', flags=regex.M | regex.V0)', 'EMAIL')]


# Presidio (Person names detection in the docstrings/comments)

In this section we test Presidio for the detection of the entities ["EMAIL_ADDRESS","PERSON", "CREDIT_CARD", "IP_ADDRESS", "IBAN_CODE"]
* We first apply it on the code files, then on their docstrings/comments only

* Even with a high threshold on the prediction score (0.8), the **predictions are bad**. The underlying NER model must have trouble inferring context from technical text,
which often includes Python keywords and code

In [None]:
!pip install presidio-analyzer
!pip install presidio-anonymizer
!python -m spacy download en_core_web_lg

In [9]:
from datasets import load_dataset
import pandas as pd
from text_extraction import get_text
from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine

COLUMN="content"
 
def extract_nl_text(example):
    """Add the text extracted from ``example[COLUMN]`` to the example.

    The value returned by ``get_text`` is stored under ``"nl_text"`` and its
    length under ``"nl_size"``. The example is updated in place and returned.
    """
    nl_text = get_text(example[COLUMN])
    example["nl_text"] = nl_text
    example["nl_size"] = len(nl_text)
    return example

In [10]:
example = '''
"""
@since Jan 6, 2011
@author Mario Steinhoff
"""

class foo():
    """
    class docstring
    """
    some_value = 2
    def bar(arg1, arg2):
        """ function docstring"""
        if arg1 == 2:
            #first comment
            print(arg1)
        else:
            # second comment
            arg2 += 3
        return arg1 + arg2
'''

print(get_text(example))

class docstring
function docstring
@since Jan 6, 2011
@author Mario Steinhoff

first comment second comment


In [72]:
# Dataset for PII detection: first 100 samples, with the extracted text columns added
ds = load_dataset("codeparrot/codeparrot-clean-valid", split="train").select(range(100))
ds_text = ds.map(extract_nl_text)

Using custom data configuration codeparrot--codeparrot-clean-valid-826c6fd8b27e5523
Found cached dataset json (/Users/loubnabenallal/.cache/huggingface/datasets/codeparrot___json/codeparrot--codeparrot-clean-valid-826c6fd8b27e5523/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)
Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/codeparrot___json/codeparrot--codeparrot-clean-valid-826c6fd8b27e5523/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-705f9821443934eb.arrow


In [73]:
ds_text

Dataset({
    features: ['repo_name', 'path', 'copies', 'size', 'content', 'license', 'hash', 'line_mean', 'line_max', 'alpha_frac', 'autogenerated', 'nl_text', 'nl_size'],
    num_rows: 100
})

In [74]:
code_content = ds_text["content"]
code_content_df = pd.DataFrame(code_content, columns=["content"])

In [75]:
# Apply Presidio
print(f"Analyzing {len(code_content_df)} code snippets")
analyzer = AnalyzerEngine()
# Add regexes to analyzer here
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
analyzer_results = batch_analyzer.analyze_dict(
    code_content_df.to_dict(orient="list"),
    language="en",
    entities=["EMAIL_ADDRESS","PERSON", "CREDIT_CARD", "IP_ADDRESS", "IBAN_CODE"],
    score_threshold=0.8
)


Analyzing 100 code snippets


In [76]:
# One entry per analyzed text: [text, text length, [[entity type, matched span, score], ...]]
detected_entities_list = []
for field_result in analyzer_results:
    # `recognizer_results` and `value` are walked in parallel, pairing each
    # text with the entities found in it.
    for spans, code in zip(field_result.recognizer_results, field_result.value):
        found = [
            [span.entity_type, code[span.start : span.end], span.score]
            for span in spans
        ]
        detected_entities_list.append([code, len(code), found])


Visualization of some examples

In [120]:
for i in range(6):
    print(f"Sample {i}:\n{detected_entities_list[i][-1]}\n")

Sample 0:
[['PERSON', 'DATASET_VIEWS', 0.85], ['PERSON', 'pgresults', 0.85], ['PERSON', 'pgresults', 0.85], ['PERSON', 'has_write_permissions', 0.85]]

Sample 1:
[['PERSON', 'Mikio Hirabayashi', 0.85], ['PERSON', "outlst = [struct.pack('>BBI'", 0.85], ['PERSON', 'fail_code = ord(sockrecv(sock', 0.85], ['PERSON', 'klen', 0.85], ['PERSON', 'maxkeys = len(self', 0.85], ['PERSON', 'self.sock.close', 0.85], ['PERSON', 'bitflag', 0.85], ['PERSON', 't0(C.rnum', 0.85], ['PERSON', 'tcrdbmisc', 0.85], ['PERSON', 'bitflag', 0.85]]

Sample 2:
[['PERSON', 'mignight', 0.85]]

Sample 3:
[['PERSON', 'TraxelsPerFrame[0', 0.85], ['PERSON', 'hypotheses_graph.toTrackingGraph', 0.85]]

Sample 4:
[['PERSON', '轮廓提取模式 Contour_Retrieval_Mode\nimage', 0.85], ['PERSON', 'img', 0.85]]

Sample 5:
[['PERSON', "uses_free\n\n\n@pytest.fixture(scope='module'", 0.85], ['PERSON', 'start_line = test_lnotab_roundtrip.__code__.co_firstlineno + 3', 0.85], ['PERSON', 'CO_FUTURE_ABSOLUTE_IMPORT=True', 0.85], ['PERSON', 'CO_FU

### Apply Presidio on docstrings and comments (mostly english text)

Presidio doesn't work on source code, so let's try it on docstrings and comments, which should mostly be English text

In [121]:
#example of extracted docstrings + comments
ds_text["nl_text"][0]

'Displays a MX Dataset and associated information.\n\nShows a full (hundreds of images) dataset its metadata and a list\nof associated files with the option to show metadata of each file\nand ways to download those files.  With write permission this page\nalso allows uploading and metadata editing.\n\nSettings for this view:\nINSTALLED_APPS += ("tardis.apps.mx_views",)\nDATASET_VIEWS = [("http://synchrotron.org.au/views/dataset/full",\n                  "tardis.apps.mx_views.views.view_full_dataset"),]\n\n pagination was removed by someone in the interface but not here. need to fix. If page request (9999) is out of range, deliver last page of results. take 4 evenly spaced images from the set'

In [86]:
#select extracted text
code_content = ds_text["nl_text"]
code_content_df = pd.DataFrame(code_content, columns=["nl_text"])

analyzer = AnalyzerEngine()
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
analyzer_results = batch_analyzer.analyze_dict(
    code_content_df.to_dict(orient="list"),
    language="en",
    entities=["EMAIL_ADDRESS","PERSON", "CREDIT_CARD", "IP_ADDRESS", "IBAN_CODE"],
    score_threshold=0.8
)

In [87]:
# One entry per analyzed text: [text, text length, [[entity type, matched span, score], ...]]
detected_entities_list2 = []
for field_result in analyzer_results:
    # `recognizer_results` and `value` are walked in parallel, pairing each
    # text with the entities found in it.
    for spans, text in zip(field_result.recognizer_results, field_result.value):
        # Only entities scoring at least 0.8 are kept.
        kept = [
            [span.entity_type, text[span.start : span.end], span.score]
            for span in spans
            if span.score >= 0.8
        ]
        detected_entities_list2.append([text, len(text), kept])


In [129]:
for i in range(20):
    if detected_entities_list2[i][-1]:
        print(f"Sample {i}:\n{detected_entities_list2[i][-1]}\n")
    if i == 2:
        print(f"Original text {i}:\n{detected_entities_list2[i][0]}\n")

Sample 0:
[['PERSON', 'DATASET_VIEWS', 0.85]]

Sample 1:
[['PERSON', 'bitflag', 0.85], ['PERSON', 'bitflag', 0.85], ['PERSON', 'Mikio Hirabayashi', 0.85]]

Sample 2:
[['PERSON', 'mignight', 0.85]]

Original text 2:
wrong precision wrong prefix yesterday is always save since we reset to mignight and do <, not <=

Sample 3:
[['PERSON', 'TraxelsPerFrame[0', 0.85], ['PERSON', 'trackingGraph', 0.85], ['PERSON', 'hypotheses_graph.toTrackingGraph', 0.85]]

Sample 4:
[['PERSON', 'type轮廓提取模式 Contour_Retrieval_Mode img = cv2.drawContours(im', 0.85]]

Sample 8:
[['PERSON', 'Luc Saffre', 0.85]]

Sample 9:
[['PERSON', 'TODO', 0.85], ['PERSON', 'TODO', 0.85], ['PERSON', 'TODO', 0.85]]

Sample 11:

Sample 14:
[['PERSON', 'Creer', 0.85], ['PERSON', 'Creer', 0.85], ['PERSON', 'Creer', 0.85], ['PERSON', 'Creer', 0.85], ['PERSON', 'Creer', 0.85]]

Sample 15:
[['PERSON', 'TODO', 0.85], ['PERSON', 'TODO', 0.85], ['PERSON', 'TODO', 0.85], ['PERSON', 'TODO', 0.85], ['PERSON', 'TODO', 0.85]]

Sample 17:
[['EM

In [108]:
detected_entities_list2[19][-1]

[['PERSON', 'Meresco', 0.85],
 ['PERSON', 'Meresco', 0.85],
 ['PERSON', 'Meresco', 0.85],
 ['PERSON', 'Meresco', 0.85],
 ['PERSON', 'Meresco', 0.85],
 ['PERSON', 'Meresco', 0.85],
 ['PERSON', 'Franklin St', 0.85]]

In [109]:
print(detected_entities_list2[19][0])

begin license   "Meresco Examples" is a project demonstrating some of the features of various components of the "Meresco Suite". Also see http://meresco.org.   Copyright (C) 2007-2008 SURF Foundation. http://www.surf.nl Copyright (C) 2007-2010 Seek You Too (CQ2) http://www.cq2.nl Copyright (C) 2007-2009 Stichting Kennisnet Ict op school. http://www.kennisnetictopschool.nl Copyright (C) 2009 Delft University of Technology http://www.tudelft.nl Copyright (C) 2009 Tilburg University http://www.uvt.nl  This file is part of "Meresco Examples"  "Meresco Examples" is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.  "Meresco Examples" is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General P

In [111]:
print(detected_entities_list2[20][-1])

[['PERSON', 'Args', 0.85], ['PERSON', 'Args', 0.85]]


In [117]:
print(detected_entities_list2[25][-1])
print(detected_entities_list[25][-1])

[['PERSON', 'MAASAPINotFound', 0.85], ['PERSON', 'ExternalAuthInfo', 0.85], ['PERSON', 'json', 0.85], ['PERSON', 'rndc', 0.85]]
[['PERSON', 'ExternalComponentsMiddleware', 0.85], ['PERSON', 'MAASAPINotFound', 0.85], ['PERSON', 'json', 0.85], ['PERSON', 'rndc', 0.85], ['PERSON', 'INTERNAL_SERVER_ERROR', 0.85], ['PERSON', 'logger.error', 0.85], ['PERSON', 'GATEWAY_TIMEOUT', 0.85]]
