### Filter predictions from `bigcode/pseudo-labeled-python-data-pii-detection` to create dataset for pretraining

In [1]:
from datasets import load_dataset, load_metric, Dataset, DatasetDict, load_from_disk
from huggingface_hub import notebook_login
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import itertools
from tqdm.notebook import tqdm
sns.set_style()
sns.set_theme()

In [2]:
TAG_TRIGGER_WORDS = {
    'PASSWORD':['auth','password','passphrase','passwd','pass','pwd', 'token'],
    'KEY': ['access', 'accesskey', 'accesstoken', 'account', 'api',  'apikey',
            'apisecret', 'apitoken', 'auth', 'authcode', 'authorization', 'bitcoin',
            'eth', 'hidden', 'iamauthenticator', 'keid', 'key', 'ledger', 'private',
            'pub', 'public', 'secret','sec', 'token', 'verification'],
    'NAME': ['author','copyright', 'created', 'createdby', 'edited', 'editedby', 'editor',
             'maintainer', 'modified', 'modifiedby', 'name', 'requested','requestedby',
             'todo', 'user', 'written', 'writtenby'],
    'USERNAME' : ['author', 'editor', 'facebook', 'github', 'linkedin', 'login',
                  'maintainer', 'todo', 'twitter', 'user', 'username']
}

In [3]:
import os

from spacy import displacy
import numpy as np
from scipy.special import softmax
from transformers import AutoModelForTokenClassification, AutoTokenizer
import re
from seqeval.metrics.sequence_labeling import get_entities
from spacy import displacy
import spacy
import string
from nltk import RegexpTokenizer
from collections import defaultdict


regtok = RegexpTokenizer(r'[\w+\.\-]+|[\S+]')

def map_spans(new_spans, old_spans):
    new_cursor = enumerate(span[-1] for span in new_spans)
    old_cursor = enumerate(span[-1] for span in old_spans)

    i,j = 0, 0
    curr_new = curr_old = (0,0)
    mapping = defaultdict(list)

    while (j < len(new_spans)) or (i < len(old_spans)):

        if curr_new < curr_old:
            try:
                j, curr_new = next(new_cursor)
            except StopIteration:
                j = len(new_spans)
        elif curr_new > curr_old:
            try:
                i, curr_old = next(old_cursor)
            except StopIteration:
                i = len(old_spans)
        else:
            try:
                j, curr_new = next(new_cursor)
            except StopIteration:
                j = len(new_spans)

            try:
                i, curr_old = next(old_cursor)
            except StopIteration:
                i = len(old_spans)

        if (j < len(new_spans)) and (i < len(old_spans)):
            mapping[j].append(i)
            
    return mapping
    
    
def remap_logits(new_spans, old_spans, old_logits):
    mapping = map_spans(new_spans, old_spans)
    mapping_iter = [mapping[i] for i in range(len(mapping))]
    new_logits = [np.mean([old_logits[j] for j in indices], axis=0) for indices in mapping_iter]
    return np.array(new_logits)

def retokenize_with_probas(example):
    new_example = dict(**example)
    new_spans = list(regtok.span_tokenize(example['content']))
    new_example.update(offset_mapping=new_spans,
                       pred=remap_logits(new_spans, new_example.pop('offset_mapping'), new_example['pred']))
    return new_example



In [4]:
dataset = load_dataset('bigcode/pseudo-labeled-python-data-pii-detection',  use_auth_token=True)

In [5]:
dataset = dataset.map(lambda x: dict(predicted_pii=json.loads(x['predicted_pii'])), num_proc=16)
dataset = dataset.map(lambda x: dict(pii=json.loads(x['secrets'])), num_proc=16)

                 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-3614250cff452193.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-99f4e19d3d391ae2.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-1df9c1861348df58.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-15c3313606f25502.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-8140acec6d863c66.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-64909e7711f638a7.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-b7fca41ce6db4148.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-719439d479ffbaa4.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-2166d08b5d6eda33.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-30008e0c10019fee.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-d381b3fb70da4a12.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-b6df92f4897de9b2.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-5115453a6171db62.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-7751c278e45018d0.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-1ecc9231784b8b49.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-7422e2bbf368894f.arrow


                 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-effbb4b62dfc39a7.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-3f9b32e43979b60d.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-8998a57fd60954e7.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-482660f039f77db5.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-023a1c4e48545600.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-e2a305b62ddee191.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-3f870f098db56663.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-03336e407e8393af.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-f532df5fb9612086.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-d4871f059aed2ba2.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-024aca23ff81c3d6.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-14bd044faee7965b.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-f4fcd10936b1750c.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-7a11346fd44322c9.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-a3067570fb6c0430.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-c5361c06d6c51d00.arrow


In [6]:
def check_triggers_in_prefix(entity, content, trigger_words):
    prefix = content[max(entity['start']-100,0):entity['start']]
    prefix = prefix.split('\n')[-1]
    prefix = re.sub('[\s\'":=(){}\[\]_/]','',prefix).lower()

    get_distance = lambda prefix, word: (len(prefix) - (prefix.find(word)+len(word))) if prefix.find(word) > -1 else 100
    distance = min([get_distance(prefix, w) for w in trigger_words])
    return distance


def correct_mislabeling(entry):
    content, predicted_pii = entry['content'], entry['predicted_pii']

    for entity in predicted_pii:
        if entity['tag'] in ['PASSWORD','KEY']:
            pwd_score = check_triggers_in_prefix(entity, content, TAG_TRIGGER_WORDS['PASSWORD']) 
            key_score = check_triggers_in_prefix(entity, content, TAG_TRIGGER_WORDS['KEY'])
            entity['tag'] = 'PASSWORD' if pwd_score <= key_score else 'KEY'
                
    return dict(predicted_pii=predicted_pii)

In [7]:
dataset = dataset.map(correct_mislabeling, num_proc=16)

                 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-7cdb7e8ce7a404fe.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-ff4d4707114bb939.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-80610a87a821bd53.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-2ebcbfb14d9b7716.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-bfb9ec2e7c1f3469.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-d347bbcd22a81b29.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-56d3e2eeef744327.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-3f9ddf5ffbcc76f8.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-ce241df78bc855fb.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-ffed85c56749d1db.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-b770c219cc15f613.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-c227c98c8db1f8fc.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-915a7ae0e9adfc17.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-6a590e02cfbc6581.arrow


  

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-b15754b0c4e30a7c.arrow
Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-0c0720938fe202ec.arrow


In [8]:
def label_by_triggers(entry, max_char_offset = 5):
    content, predicted_pii = entry['content'], entry['predicted_pii']

    for entity in predicted_pii:
        if entity['tag'] in TAG_TRIGGER_WORDS:
            if check_triggers_in_prefix(entity, content, TAG_TRIGGER_WORDS[entity['tag']]) <= max_char_offset:
                entity['label'] = 'GOOD'
                
    return dict(predicted_pii=predicted_pii)


In [9]:
dataset = dataset.map(label_by_triggers, num_proc=16)

                 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-90f2e8b74aa6230f.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-f89566d086fef8fe.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-4f66579cf4c9e624.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-0aef2ed0278a6f90.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-06cfc58ef0b1747e.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-565448bdf2b282ab.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-92a2842b56b86064.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-6c1854a35726ba55.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-563095e0aaf1d229.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-27e73afc931045fc.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-90dcda8a451b3d3d.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-aab69261da9641e5.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-77222caecea65c23.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-ff1e346bafc2eb74.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-e450e6d8456d3c95.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-3761558327b34872.arrow


In [10]:
thresholds_for_good = {
    "PASSWORD":1.0,
    "KEY":1.0,
    "NAME":0.7,
    "USERNAME":0.7,
    "EMAIL":0.9,
    "IP_ADDRESS":1.0,
    'AMBIGUOUS':1.0,
}

thresholds_for_bad ={
    "PASSWORD":0.0,
    "KEY":0.0,
    "NAME":0.6,
    "USERNAME": 0.61,
    "EMAIL":0.0,
    "IP_ADDRESS":0.0,
    "AMBIGUOUS":0.0,
}

def threshold_good(entry):
    predicted_pii = entry['predicted_pii']
    for entity in predicted_pii:
        entity['label'] = "GOOD" if thresholds_for_good[entity['tag']] <= entity['confidence'] else entity['label']
    return dict(predicted_pii=predicted_pii)
        
    
def threshold_bad(entry):
    predicted_pii = entry['predicted_pii']
    for entity in predicted_pii:
        if entity['label']!='GOOD':
            entity['label'] = "BAD" if thresholds_for_bad[entity['tag']] > entity['confidence'] else entity['label']
    return dict(predicted_pii=predicted_pii)
        

dataset = dataset.map(threshold_good, num_proc=16) 
dataset = dataset.map(threshold_bad, num_proc=16)  

                 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-839438f37e374a4c.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-1f7cd8a0a51f5b3b.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-83dcd939b80d3a10.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-b3433b7c046a5ae6.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-5adf588fe059a1cd.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-9b5e215dae569f53.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-427ffafa60185f4f.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-33c958119ab415dc.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-6de61423f214554e.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-058daeb320c915f4.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-c7a582b43a9c9495.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-041c2c89a6b04e79.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-241421932e96cc14.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-3e1e3735fc376f42.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-cbee6e170dbfa5ac.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-61030ff82e818a7a.arrow


                 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-b65cb9e068b8fd4b.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-2cbf182475ebf69e.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-7868b72fc939a9c3.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-a7f018f730a48751.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-6e49ee6bfd97ae07.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-5d6bf312b3e20f95.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-caaccf6e73f153f0.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-c43dcf3f75717dcb.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-8e278125ee67d3d4.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-0e671802d3d1643d.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-c48ed5318ce4e282.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-a3767841889dc615.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-20bb0d6bd8257997.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-5507a5191688df97.arrow


  

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-fdd339b8fb6c32fd.arrow
Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-8062a300bd1cdcc9.arrow


In [11]:
# Discard wrongly detected KEY and PASSWORD

filtered_dataset = dataset.map(lambda x: dict(
    predicted_pii=[e for e in x['predicted_pii'] \
                   if (e['tag'] not in ['KEY','PASSWORD','IP_ADDRESS','EMAIL']) or (e['label'] == 'GOOD')]
    ), num_proc=16)

                 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-1028021f2dc9f9e8.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-ec297d103aeab537.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-7191ce9401b7146b.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-a93ac2d90445f41b.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-c42d421b0c02b5f5.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-e4de146397322288.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-082f9244aa0270a9.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-7cc2b62d4ee9e688.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-ec246bec4a761891.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-d28e74c3a5eb7f1a.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-f6e261fb03da8c3b.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-c3988f638a96f1be.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-0fef6571aab8ada4.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-d74c4af36b58c17e.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-02afaf299f39527e.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-8f3dc526b6ed6601.arrow


In [12]:
filtered_dataset = filtered_dataset.filter(lambda x: not any([e['label']=='BAD' for e in x['predicted_pii']]), num_proc=16)
filtered_dataset

 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-baad9bfb48b36f31_00000_of_00016.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-baad9bfb48b36f31_00001_of_00016.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-baad9bfb48b36f31_00002_of_00016.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-baad9bfb48b36f31_00003_of_00016.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-baad9bfb48b36f31_00004_of_00016.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-baad9bfb48b36f31_00005_of_00016.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-baad9bfb48b36f31_00006_of_00016.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-baad9bfb48b36f31_00007_of_00016.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-baad9bfb48b36f31_00008_of_00016.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-baad9bfb48b36f31_00009_of_00016.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-baad9bfb48b36f31_00010_of_00016.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-baad9bfb48b36f31_00011_of_00016.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-baad9bfb48b36f31_00012_of_00016.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-baad9bfb48b36f31_00013_of_00016.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-baad9bfb48b36f31_00014_of_00016.arrow


 

Loading cached processed dataset at /data3/monty/datasets/ensemble-labeled-python-data-pii-detection-final/cache-baad9bfb48b36f31_00015_of_00016.arrow


Dataset({
    features: ['lang', 'content', 'secrets', 'id', 'predicted_pii', 'pii'],
    num_rows: 37985
})

In [13]:
def _is_in(border, span):
    return min(*span) <= border < max(*span)

def is_overlap(span, reference_span):
    return _is_in(min(*span), reference_span) or \
           _is_in(max(*span), reference_span) or \
           _is_in(min(*reference_span), span) or \
           _is_in(min(*reference_span), span) 
    
    
def _exclude_overlaps(spans, ref_spans):
    return [span for span in spans if not any([is_overlap(span, ref) for ref in ref_spans])]


def merge_predicted_to_pii(entry):
    pii = entry['pii']
    pii_spans = [(e['start'], e['end']) for e in pii]
    predicted_pii=[ent for ent in entry['predicted_pii']
                       if not any([is_overlap((ent['start'],ent['end']), ref) for ref in pii_spans])]
    
    for e in pii:
        e['detected_by'] = 'regex'
        e['confidence'] = 1.0
    for e in predicted_pii:
        e['detected_by'] = 'model'
        e.pop('label')
    
    return dict(pii = pii + predicted_pii)

In [14]:
filtered_dataset = filtered_dataset.map(merge_predicted_to_pii, num_proc=16)

                  

#0:   0%|          | 0/2375 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/2374 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/2374 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/2374 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/2374 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/2374 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/2374 [00:00<?, ?ex/s]

#7:   0%|          | 0/2374 [00:00<?, ?ex/s]

  

#8:   0%|          | 0/2374 [00:00<?, ?ex/s]

 

#9:   0%|          | 0/2374 [00:00<?, ?ex/s]

 

#10:   0%|          | 0/2374 [00:00<?, ?ex/s]

 

#11:   0%|          | 0/2374 [00:00<?, ?ex/s]

#12:   0%|          | 0/2374 [00:00<?, ?ex/s]

  

#14:   0%|          | 0/2374 [00:00<?, ?ex/s]

#13:   0%|          | 0/2374 [00:00<?, ?ex/s]

 

#15:   0%|          | 0/2374 [00:00<?, ?ex/s]

In [15]:
filtered_dataset.remove_columns(['predicted_pii'])\
    .map(lambda x: dict(pii=json.dumps(x['pii'])))\
    .save_to_disk('ensemble-labeled-python-data-pii-detection-filtered')

  0%|          | 0/37985 [00:00<?, ?ex/s]