In [1]:
import json
from datasets import load_dataset

from detect_pii import scan_pii_batch_viz, scan_pii_batch
from utils.evaluation import evaluate_pii, evaluate_pii_ds, recall_precision_all_tags, recall_precision

# Load only the train split of the annotated PII dataset; the request is authenticated
# (use_auth_token=True).
ds = load_dataset(
    "bigcode/pii-for-code",
    split="train",
    use_auth_token=True,
)

Using custom data configuration bigcode--pii-for-code-99989b1cd3de6b7b
Found cached dataset json (/Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--pii-for-code-99989b1cd3de6b7b/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


In [2]:
# Show the dataset summary (feature names and number of rows).
ds

Dataset({
    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified'],
    num_rows: 400
})

In [None]:
# Candidate e-mail pattern (scratch cell; `r` is not referenced by the other cells shown here).
# Written as a raw string: without the r-prefix Python turns "\b" into a backspace character and
# flags "\s", "\p", "\w" and "\." as invalid escape sequences.
# NOTE(review): the variable-width lookbehind and \p{Han} / \p{L} are rejected by the stdlib `re`
# module; presumably this is meant for the third-party `regex` package — confirm before use.
r = r"""(?<=^|[\b\s@,?!;:)('".\p{Han}<])([^\b\s@?!;,:)('"<]+@[^\b\s@!?;,/]*[^\b\s@?!;,/:)('">.]\.\p{L}\w{1,})(?=$|[\b\s@,?!;:)('".\p{Han}>])"""

## Run PII detection on the dataset

### 1. Use Regexes for all the PII types

In [2]:
# Keep only the first 100 rows (indices 0-99) for the experiments below.
examples = ds.select(range(100))

In [3]:
# Run the PII scanner over the selected rows in batches of 10.
ds_pii = examples.map(
    scan_pii_batch,
    batched=True,
    batch_size=10,
)

Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--pii-for-code-99989b1cd3de6b7b/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-0fffc14d2ecdef1b.arrow


In [20]:
# small test
def load_json(sample):
    """Decode a JSON-encoded value, falling back to an empty list.

    Args:
        sample: JSON text (``str``, ``bytes`` or ``bytearray``), or ``None``.

    Returns:
        The decoded JSON value, or ``[]`` when ``sample`` is ``None`` or is
        not valid JSON.
    """
    if sample is None:
        # json.loads(None) raises TypeError rather than ValueError, so a
        # missing value has to be handled before parsing.
        return []
    try:
        return json.loads(sample)
    except ValueError:
        # json.JSONDecodeError is a subclass of ValueError.
        return []

# Evaluate the first two samples individually and print each intermediate result.
for i in range(2):
    print(f"\nSample {i}:\n")

    # Read the row once, then pull both JSON columns out of it.
    row = ds_pii[i]
    pii = load_json(row["pii_modified"])  # reference annotations
    secrets = load_json(row["secrets"])  # detector output

    print(f"pii predictins: {secrets}")
    print(f"pii references: {pii}")

    metrics = evaluate_pii(pii, secrets)
    print(metrics)
    print(recall_precision(metrics))
    print(recall_precision_all_tags(metrics))


Sample 0:

pii predictins: [{'tag': 'EMAIL', 'value': 'davis@dlib.net', 'start': 38, 'end': 52}]
pii references: [{'tag': 'EMAIL', 'value': 'davis@dlib.net', 'start': 38, 'end': 52, 'context': '// Copyright (C) 2003  Davis E. King (davis@dlib.net), Miguel Grinberg\n// License: Boost Software Lice'}, {'tag': 'NAME', 'value': 'Miguel Grinberg', 'start': 55, 'end': 70, 'context': 'pyright (C) 2003  Davis E. King (davis@dlib.net), Miguel Grinberg\n// License: Boost Software License   See LICENSE.'}, {'tag': 'NAME', 'value': 'Davis E. King', 'start': 23, 'end': 36, 'context': '// Copyright (C) 2003  Davis E. King (davis@dlib.net), Miguel Grinberg\n// License: Boo'}]
{'EMAIL': {'TP': 1, 'FN': 0, 'FP': 0}, 'IP_ADDRESS': {'TP': 0, 'FN': 0, 'FP': 0}, 'KEY': {'TP': 0, 'FN': 0, 'FP': 0}}
{'EMAIL': {'recall': 1.0, 'precision': 1.0}, 'IP_ADDRESS': {'recall': 1.0, 'precision': 1.0}, 'KEY': {'recall': 1.0, 'precision': 1.0}}
{'recall': 1.0, 'precision': 1.0}

Sample 1:

pii predictins: [{'tag': 'KEY

In [14]:
# Score the detections of the first run (`ds_pii`) over all 100 selected samples.
# NOTE(review): alpha/beta are 0.3 here but 0.4 in the later evaluation calls — confirm which
# value is intended before comparing the numbers.
metrics, metrics_dict = evaluate_pii_ds(
    ds_pii,
    pred_column="secrets",
    ref_column="pii_modified",
    overall_score=False,
    alpha=0.3,
    beta=0.3,
)

In [11]:
# Display the first value returned by evaluate_pii_ds (recall/precision per tag).
metrics

{'EMAIL': {'recall': 0.9791666666666666, 'precision': 0.17735849056603772},
 'IP_ADDRESS': {'recall': 0.9523809523809523,
  'precision': 0.47619047619047616},
 'KEY': {'recall': 0.11764705882352941, 'precision': 0.03278688524590164}}

In [12]:
# Display the second value returned by evaluate_pii_ds (TP/FN/FP counts per tag).
metrics_dict

{'EMAIL': {'TP': 47, 'FN': 1, 'FP': 218},
 'IP_ADDRESS': {'TP': 20, 'FN': 1, 'FP': 22},
 'KEY': {'TP': 2, 'FN': 15, 'FP': 59}}

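Why are the `KEY` results below different from the dataset-level results above? Note that the next cell pools the `KEY` entities of all samples into a single `evaluate_pii` call and uses `alpha=0.4, beta=0.4`, whereas the evaluation above uses `alpha=0.3, beta=0.3`.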

In [23]:
# Gather every KEY-tagged entity from both columns over the whole subset, then score the two
# pooled lists with a single evaluate_pii call.
# NOTE(review): this differs from the earlier evaluate_pii_ds call in two visible ways — all
# samples are pooled into one call, and alpha/beta are 0.4 instead of 0.3. Either could explain
# the different KEY counts — confirm against utils.evaluation.
references, predictions = [], []
for elem in ds_pii:
    refs = load_json(elem["pii_modified"])
    preds = load_json(elem["secrets"])
    references.extend(e for e in refs if e["tag"] == "KEY")
    predictions.extend(e for e in preds if e["tag"] == "KEY")

metrics = evaluate_pii(references, predictions, alpha=0.4, beta=0.4)
print(metrics)
print(recall_precision(metrics))

{'EMAIL': {'TP': 0, 'FN': 0, 'FP': 0}, 'IP_ADDRESS': {'TP': 0, 'FN': 0, 'FP': 0}, 'KEY': {'TP': 7, 'FN': 10, 'FP': 54}}
{'EMAIL': {'recall': 1.0, 'precision': 1.0}, 'IP_ADDRESS': {'recall': 1.0, 'precision': 1.0}, 'KEY': {'recall': 0.4117647058823529, 'precision': 0.11475409836065574}}


### 2. Use detect-secrets to detect keys

In [24]:
# Second run over the same rows: identical batched scan, but with key_detector="detect-secrets".
ds_pii_2 = examples.map(
    lambda x: scan_pii_batch(x, key_detector="detect-secrets"),
    batched=True,
    batch_size=10,
)

  0%|          | 0/10 [00:00<?, ?ba/s]

In [27]:
# Example of detected keys: raw JSON string stored in the "secrets" column for row 27.
ds_pii_2[27]["secrets"]

'[{"tag": "IP_ADDRESS", "value": "0.0.0.0", "start": 163, "end": 170}, {"tag": "IP_ADDRESS", "value": "162.243.201.175", "start": 571, "end": 586}, {"tag": "KEY", "value": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6IjU5ODFmMTY3MjEyYjM0OGFlZDdmYTlmNSIsInNjb3BlIjpbImFkbWluIiwiZXZlbnRfbWFuYWdlciIsImV2ZW50X2xvZ2dlciIsImV2ZW50X3dhdGNoZXIiXSwiaWF0IjoxNTI1MDE0NDE3fQ.", "start": 276, "end": 468}, {"tag": "KEY", "value": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6IjU5ODFmMTY3MjEyYjM0OGFlZDdmYTlmNSIsInNjb3BlIjpbImFkbWluIiwiZXZlbnRfbWFuYWdlciIsImV2ZW50X2xvZ2dlciIsImV2ZW50X3dhdGNoZXIiXSwiaWF0IjoxNTI1MDEzNTUxfQ.", "start": 694, "end": 886}]'

In [28]:
# Score the detect-secrets run (`ds_pii_2`) over all 100 selected samples.
metrics, metrics_dict = evaluate_pii_ds(
    ds_pii_2,
    pred_column="secrets",
    ref_column="pii_modified",
    overall_score=False,
    alpha=0.4,
    beta=0.4,
)

In [29]:
# Display the first value returned by evaluate_pii_ds for the detect-secrets run (recall/precision per tag).
metrics

{'EMAIL': {'recall': 0.9791666666666666, 'precision': 0.17735849056603772},
 'IP_ADDRESS': {'recall': 0.9523809523809523,
  'precision': 0.47619047619047616},
 'KEY': {'recall': 0.4117647058823529, 'precision': 0.15555555555555556}}

In [30]:
# Display the second value returned by evaluate_pii_ds for the detect-secrets run (TP/FN/FP counts per tag).
metrics_dict

{'EMAIL': {'TP': 47, 'FN': 1, 'FP': 218},
 'IP_ADDRESS': {'TP': 20, 'FN': 1, 'FP': 22},
 'KEY': {'TP': 7, 'FN': 10, 'FP': 38}}