In [1]:
cd ../src

/workspace/Script/NLP/PII/src


In [2]:
import os
import gc
import sys
import json
import time
import torch
import joblib
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path
import plotly.express as px
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

In [3]:
from data.data_utils import get_offset_mapping,clean_text
from data.dataset import FeedbackDataset

In [4]:
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModel, AutoConfig

%env TOKENIZERS_PARALLELISM = true

env: TOKENIZERS_PARALLELISM=true


# Params

In [5]:
data_path = Path(r"/database/kaggle/PII/data")
os.listdir(data_path)

['train.json',
 'pii_dataset_fixed.csv',
 'mixtral-8x7b-v1.json',
 '.~lock.lecture2.pptx#',
 'Fake_data_1850_218.json',
 'test.json',
 'archive.zip',
 'archive',
 'pii-masking-200k.csv',
 'moredata_dataset_fixed.csv',
 'sample_submission.csv',
 'mpware_mixtral8x7b_v1.1.json']

In [32]:
from datasets import load_dataset
dataset = load_dataset("ai4privacy/pii-masking-200k")

In [33]:
df = pd.DataFrame(dataset['train'])#[['unmasked_text','privacy_mask']]
df.shape

(209261, 6)

In [34]:
df.head()

Unnamed: 0,masked_text,unmasked_text,privacy_mask,span_labels,bio_labels,tokenised_text
0,A student's assessment was found on device bea...,A student's assessment was found on device bea...,"{'[PHONEIMEI_1]': '06-184755-866851-3', '[JOBA...","[[0, 57, 'O'], [57, 75, 'PHONEIMEI_1'], [75, 1...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-PHON...","[a, student, ', s, assessment, was, found, on,..."
1,"Dear [FIRSTNAME_1], as per our records, your l...","Dear Omer, as per our records, your license 78...","{'[FIRSTNAME_1]': 'Omer', '[VEHICLEVIN_1]': '7...","[[0, 5, 'O'], [5, 9, 'FIRSTNAME_1'], [9, 44, '...","[O, B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O...","[dear, om, ##er, ,, as, per, our, records, ,, ..."
2,[FIRSTNAME_1] could you please share your reco...,Kattie could you please share your recomndatio...,"{'[FIRSTNAME_1]': 'Kattie', '[AGE_1]': '72', '...","[[0, 6, 'FIRSTNAME_1'], [6, 75, 'O'], [75, 77,...","[B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O, O...","[kat, ##tie, could, you, please, share, your, ..."
3,Emergency supplies in [BUILDINGNUMBER_1] need ...,Emergency supplies in 16356 need a refill. Use...,"{'[BUILDINGNUMBER_1]': '16356', '[MASKEDNUMBER...","[[0, 22, 'O'], [22, 27, 'BUILDINGNUMBER_1'], [...","[O, O, O, B-BUILDINGNUMBER, I-BUILDINGNUMBER, ...","[emergency, supplies, in, 1635, ##6, need, a, ..."
4,"The [AGE_1] old child at [BUILDINGNUMBER_1], h...","The 88 old child at 5862, has showcased an unu...","{'[AGE_1]': '88', '[BUILDINGNUMBER_1]': '5862'...","[[0, 4, 'O'], [4, 6, 'AGE_1'], [6, 20, 'O'], [...","[O, B-AGE, O, O, O, B-BUILDINGNUMBER, I-BUILDI...","[the, 88, old, child, at, 58, ##6, ##2, ,, has..."


In [35]:
# df["label"] = df['privacy_mask'].apply(lambda x:list(eval(x).keys()))

In [11]:
# lbs = sum(df["label"].values.tolist(),[])

In [193]:
# Pick one random row whose privacy mask mentions a USERAGENT entity and show
# its raw text. `idx` is an index *label* (taken from `.index`), so it is
# looked up with `.loc`; the original `print(...),idx` built the tuple
# `(None, idx)` as the cell value instead of displaying `idx`.
idx = df[df.privacy_mask.str.contains('USERAGENT')].sample(1).index.values[0]
print(df.loc[idx].unmasked_text)
idx

Veuillez envoyer une copie des documents de l'IEP pour Nadia.Bartoletti@yahoo.com sur son appareil individuel (Opera/10.58 (Macintosh; Intel Mac OS X 10.6.7 U; FO Presto/2.9.182 Version/10.00)). Veillez à sécuriser les données.


(None, 55405)

In [194]:
print(df.iloc[idx].privacy_mask)

{'[EMAIL_1]': 'Nadia.Bartoletti@yahoo.com', '[USERAGENT_1]': 'Opera/10.58 (Macintosh; Intel Mac OS X 10.6.7 U; FO Presto/2.9.182 Version/10.00)'}


In [62]:
print(df.sample(1).privacy_mask.values[0])

{'[COUNTY_1]': 'Clark County', '[USERNAME_1]': 'Mary36', '[IPV6_1]': '68d7:e0b0:5247:83e7:0c2f:dcc3:87ca:f0ef'}


In [12]:
# NOTE(review): the only assignment of `lbs` visible in this notebook is the
# commented-out `lbs = sum(df["label"].values.tolist(),[])` cell, so this cell
# presumably raises NameError on a fresh kernel — confirm, and restore that
# cell if the label inventory is still needed.
lbs = np.unique(lbs)
len(lbs)

146

In [13]:
lbs

array(['[ACCOUNTNAME_1]', '[ACCOUNTNAME_2]', '[ACCOUNTNUMBER_1]',
       '[ACCOUNTNUMBER_2]', '[ACCOUNTNUMBER_3]', '[AGE_1]', '[AGE_2]',
       '[AGE_3]', '[AMOUNT_1]', '[AMOUNT_2]', '[AMOUNT_3]', '[BIC_1]',
       '[BITCOINADDRESS_1]', '[BITCOINADDRESS_2]', '[BUILDINGNUMBER_1]',
       '[BUILDINGNUMBER_2]', '[CITY_1]', '[CITY_2]', '[CITY_3]',
       '[COMPANYNAME_1]', '[COMPANYNAME_2]', '[COMPANYNAME_3]',
       '[COMPANYNAME_4]', '[COMPANYNAME_5]', '[COUNTY_1]', '[COUNTY_2]',
       '[COUNTY_3]', '[CREDITCARDCVV_1]', '[CREDITCARDISSUER_1]',
       '[CREDITCARDISSUER_2]', '[CREDITCARDNUMBER_1]',
       '[CREDITCARDNUMBER_2]', '[CREDITCARDNUMBER_3]', '[CURRENCYCODE_1]',
       '[CURRENCYCODE_2]', '[CURRENCYNAME_1]', '[CURRENCYNAME_2]',
       '[CURRENCYSYMBOL_1]', '[CURRENCYSYMBOL_2]', '[CURRENCYSYMBOL_3]',
       '[CURRENCY_1]', '[CURRENCY_2]', '[CURRENCY_3]', '[DATE_1]',
       '[DATE_2]', '[DATE_3]', '[DOB_1]', '[DOB_2]', '[DOB_3]',
       '[EMAIL_1]', '[EMAIL_2]', '[EMAIL_3]', '[ET

In [36]:
# Source entity types of the dataset, grouped by the coarse target label that
# each group is collapsed into.
ID_NUM = [
    "ACCOUNTNUMBER", "BITCOINADDRESS", "BIC", "PASSWORD", "PIN", "VEHICLEVIN",
    "CREDITCARDNUMBER", "ETHEREUMADDRESS", "IBAN", "IPV6", "LITECOINADDRESS",
    "MASKEDNUMBER",
]
ADRESS_STREET = [
    "BUILDINGNUMBER", "SECONDARYADDRESS", "STREET", "CITY", "ZIPCODE", "COUNTY",
]
MAIL = ["EMAIL"]
NAME_STUDENT = ["FIRSTNAME", "LASTNAME", "MIDDLENAME"]
PHONE = ["IPV4", "PHONEIMEI", "PHONENUMBER"]
USERNAME = ["USERNAME"]

# Every source type that is kept, in group order.
KEEP = ID_NUM + ADRESS_STREET + MAIL + NAME_STUDENT + PHONE + USERNAME

# (target label, member types) pairs, checked in this order: the first group
# containing a type decides its target label.
_TARGET_GROUPS = [
    ("ID_NUM", ID_NUM),
    ("STREET_ADDRESS", ADRESS_STREET),
    ("EMAIL", MAIL),
    ("NAME_STUDENT", NAME_STUDENT),
    ("PHONE_NUM", PHONE),
    ("USERNAME", USERNAME),
]

# source entity type -> target label
KEEP_DIC = {}
for k in KEEP:
    KEEP_DIC[k] = next(target for target, members in _TARGET_GROUPS if k in members)

print(KEEP_DIC)

{'ACCOUNTNUMBER': 'ID_NUM', 'BITCOINADDRESS': 'ID_NUM', 'BIC': 'ID_NUM', 'PASSWORD': 'ID_NUM', 'PIN': 'ID_NUM', 'VEHICLEVIN': 'ID_NUM', 'CREDITCARDNUMBER': 'ID_NUM', 'ETHEREUMADDRESS': 'ID_NUM', 'IBAN': 'ID_NUM', 'IPV6': 'ID_NUM', 'LITECOINADDRESS': 'ID_NUM', 'MASKEDNUMBER': 'ID_NUM', 'BUILDINGNUMBER': 'STREET_ADDRESS', 'SECONDARYADDRESS': 'STREET_ADDRESS', 'STREET': 'STREET_ADDRESS', 'CITY': 'STREET_ADDRESS', 'ZIPCODE': 'STREET_ADDRESS', 'COUNTY': 'STREET_ADDRESS', 'EMAIL': 'EMAIL', 'FIRSTNAME': 'NAME_STUDENT', 'LASTNAME': 'NAME_STUDENT', 'MIDDLENAME': 'NAME_STUDENT', 'IPV4': 'PHONE_NUM', 'PHONEIMEI': 'PHONE_NUM', 'PHONENUMBER': 'PHONE_NUM', 'USERNAME': 'USERNAME'}


In [37]:
def modify_dictionary(original_dict, new_key):
    """Translate one privacy mask into coarse target labels.

    Parameters
    ----------
    original_dict : str or dict
        Mapping of placeholder (e.g. ``'[FIRSTNAME_1]'``) to the text it
        masks, given either as a dict or as its Python-literal string form.
    new_key : dict
        Maps a source entity type (e.g. ``'FIRSTNAME'``) to its target label.

    Returns
    -------
    tuple(list, list)
        ``(label, tokens)``: the target label and the masked text of every
        kept entity, in the order the placeholders appear in the mask.

    Notes
    -----
    Address components (building number, secondary address, street, city,
    zip code, county) are only kept when the mask contains at least three
    distinct component types and one of them is ``STREET``.
    """
    import ast  # local import so the cell does not depend on the import cell

    street_parts = ("BUILDINGNUMBER", "SECONDARYADDRESS", "STREET", "CITY", "ZIPCODE", "COUNTY")

    if isinstance(original_dict, str):
        # The mask text comes from a downloaded dataset: parse it as a literal
        # rather than eval()-ing it, so it can never execute code.
        original_dict = ast.literal_eval(original_dict)

    def entity_type(placeholder):
        # '[FIRSTNAME_1]' -> 'FIRSTNAME'
        return placeholder.split('_')[0].replace('[', '').replace(']', '')

    # Entity types that are both present in this mask and mapped by new_key.
    com = {entity_type(k) for k in original_dict} & set(new_key)

    test_street = com & set(street_parts)
    street_is_ok = len(test_street) >= 3 and "STREET" in test_street

    label = []
    tokens = []
    for placeholder, value in original_dict.items():
        kval = entity_type(placeholder)
        if kval not in com:
            continue
        if kval in street_parts and not street_is_ok:
            # Isolated address fragments are not treated as a street address.
            continue
        label.append(new_key[kval])
        tokens.append(value)

    return label,tokens

In [66]:
df[df.privacy_mask.str.contains('ACCOUNTNUMBER')].sample(1).privacy_mask.values[0]

"{'[ACCOUNTNUMBER_1]': '07639708', '[TIME_1]': '0', '[IP_1]': '88.140.94.239'}"

In [88]:
idx = df[df.privacy_mask.str.contains('STREET')].sample(1).index.values[0]
modify_dictionary(df.iloc[idx].privacy_mask, KEEP_DIC)

(['NAME_STUDENT',
  'STREET_ADDRESS',
  'STREET_ADDRESS',
  'STREET_ADDRESS',
  'EMAIL',
  'PHONE_NUM'],
 ['Thalia',
  '4967',
  'Madison Avenue',
  'Logan County',
  'Janet.Trantow48@hotmail.com',
  '11-249792-821061-9'])

In [38]:
df["label"] = df['privacy_mask'].apply(lambda x:modify_dictionary(x,KEEP_DIC))

In [39]:
# `label` holds the (labels, values) tuple returned by modify_dictionary;
# split it into one column per tuple element.
df['label_text'] = df['label'].transform(lambda x:x[0])
df['label_att'] = df['label'].transform(lambda x:x[1])

In [40]:
df.head()

Unnamed: 0,masked_text,unmasked_text,privacy_mask,span_labels,bio_labels,tokenised_text,label,label_text,label_att
0,A student's assessment was found on device bea...,A student's assessment was found on device bea...,"{'[PHONEIMEI_1]': '06-184755-866851-3', '[JOBA...","[[0, 57, 'O'], [57, 75, 'PHONEIMEI_1'], [75, 1...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-PHON...","[a, student, ', s, assessment, was, found, on,...","([PHONE_NUM], [06-184755-866851-3])",[PHONE_NUM],[06-184755-866851-3]
1,"Dear [FIRSTNAME_1], as per our records, your l...","Dear Omer, as per our records, your license 78...","{'[FIRSTNAME_1]': 'Omer', '[VEHICLEVIN_1]': '7...","[[0, 5, 'O'], [5, 9, 'FIRSTNAME_1'], [9, 44, '...","[O, B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O...","[dear, om, ##er, ,, as, per, our, records, ,, ...","([NAME_STUDENT, ID_NUM], [Omer, 78B5R2MVFAHJ48...","[NAME_STUDENT, ID_NUM]","[Omer, 78B5R2MVFAHJ48500]"
2,[FIRSTNAME_1] could you please share your reco...,Kattie could you please share your recomndatio...,"{'[FIRSTNAME_1]': 'Kattie', '[AGE_1]': '72', '...","[[0, 6, 'FIRSTNAME_1'], [6, 75, 'O'], [75, 77,...","[B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O, O...","[kat, ##tie, could, you, please, share, your, ...","([NAME_STUDENT], [Kattie])",[NAME_STUDENT],[Kattie]
3,Emergency supplies in [BUILDINGNUMBER_1] need ...,Emergency supplies in 16356 need a refill. Use...,"{'[BUILDINGNUMBER_1]': '16356', '[MASKEDNUMBER...","[[0, 22, 'O'], [22, 27, 'BUILDINGNUMBER_1'], [...","[O, O, O, B-BUILDINGNUMBER, I-BUILDINGNUMBER, ...","[emergency, supplies, in, 1635, ##6, need, a, ...","([ID_NUM], [5890724654311332])",[ID_NUM],[5890724654311332]
4,"The [AGE_1] old child at [BUILDINGNUMBER_1], h...","The 88 old child at 5862, has showcased an unu...","{'[AGE_1]': '88', '[BUILDINGNUMBER_1]': '5862'...","[[0, 4, 'O'], [4, 6, 'AGE_1'], [6, 20, 'O'], [...","[O, B-AGE, O, O, O, B-BUILDINGNUMBER, I-BUILDI...","[the, 88, old, child, at, 58, ##6, ##2, ,, has...","([ID_NUM], [Y2rWliOhf8Ir])",[ID_NUM],[Y2rWliOhf8Ir]


In [41]:
df.iloc[2].span_labels

"[[0, 6, 'FIRSTNAME_1'], [6, 75, 'O'], [75, 77, 'AGE_1'], [77, 82, 'O'], [82, 97, 'GENDER_1'], [97, 103, 'O'], [103, 117, 'HEIGHT_1'], [117, 118, 'O']]"

In [42]:
df.iloc[2].unmasked_text

'Kattie could you please share your recomndations about vegetarian diet for 72 old Intersex person with 158centimeters?'

In [43]:
# ======================================================================================== #
from spacy.lang.en import English
en_tokenizer = English().tokenizer

def tokenize_with_spacy(text, tokenizer=en_tokenizer):
    """Tokenize `text` and record each token's character span.

    Returns a dict with two parallel lists: 'tokens' (the token strings) and
    'offset_mapping' ((start, end) character offsets, end exclusive).
    """
    tokens = []
    offset_mapping = []
    for tok in tokenizer(text):
        begin = tok.idx
        tokens.append(tok.text)
        offset_mapping.append((begin, begin + len(tok)))
    return {'tokens': tokens, 'offset_mapping': offset_mapping}

In [15]:
tokenize_with_spacy(df.iloc[2].unmasked_text, tokenizer=en_tokenizer)

{'tokens': ['Kattie',
  'could',
  'you',
  'please',
  'share',
  'your',
  'recomndations',
  'about',
  'vegetarian',
  'diet',
  'for',
  '72',
  'old',
  'Intersex',
  'person',
  'with',
  '158centimeters',
  '?'],
 'offset_mapping': [(0, 6),
  (7, 12),
  (13, 16),
  (17, 23),
  (24, 29),
  (30, 34),
  (35, 48),
  (49, 54),
  (55, 65),
  (66, 70),
  (71, 74),
  (75, 77),
  (78, 81),
  (82, 90),
  (91, 97),
  (98, 102),
  (103, 117),
  (117, 118)]}

In [44]:
df["tokenised_text"] = df['unmasked_text'].apply(lambda x:tokenize_with_spacy(x,en_tokenizer))

In [45]:
# `tokenised_text` holds the dict returned by tokenize_with_spacy; expose its
# two entries as separate columns.
df["tokens"] = df['tokenised_text'].apply(lambda x:x['tokens'])
df["offset_mapping"] = df['tokenised_text'].apply(lambda x:x['offset_mapping'])

In [46]:
def get_labels(x,dic_to_keep):
    """Project the character-level spans of one row onto its tokens.

    Parameters
    ----------
    x : mapping (DataFrame row or dict)
        Must provide ``'span_labels'`` (list of ``[start, end, label]`` or its
        Python-literal string form), ``'label_text'`` (target labels kept for
        the row) and ``'offset_mapping'`` (one ``(start, end)`` per token).
    dic_to_keep : dict
        Maps a source entity type (e.g. ``'FIRSTNAME'``) to its target label.

    Returns
    -------
    list of str
        One label per token: the target label of a kept span that shares at
        least one character with the token, otherwise ``"O"``. When several
        kept spans touch a token, the last one in ``span_labels`` wins.
        A list is returned in every case, including when no span is kept.
    """
    import ast  # local import so the cell does not depend on the import cell

    spans = x['span_labels']
    if isinstance(spans, str):
        # Parsed as a literal (not eval) because the text comes from a
        # downloaded dataset.
        spans = ast.literal_eval(spans)

    # Address spans are only labelled when the row kept a street address.
    street_is_ok = "STREET_ADDRESS" in x['label_text']

    kept = []
    for start, end, raw_label in spans:
        entity = raw_label.split('_')[0]  # 'FIRSTNAME_1' -> 'FIRSTNAME'
        if entity not in dic_to_keep:
            continue
        target = dic_to_keep[entity]
        if target == "STREET_ADDRESS" and not street_is_ok:
            continue
        kept.append(((start, end), target))

    # reshape(-1, 2) keeps the (n, 2) layout even when there are no tokens.
    toff = np.asarray(x['offset_mapping'], dtype=np.int64).reshape(-1, 2)
    labels = ["O"] * len(toff)

    if kept and len(toff):
        woff = np.asarray([span for span, _ in kept], dtype=np.int64).reshape(-1, 2)

        # Pairwise intersection of every kept span (rows) with every token
        # (columns); a positive length means they share a character. This is
        # the same criterion as a non-zero IoU, without the division.
        inter_start = np.maximum(woff[:, :1], toff[None, :, 0])
        inter_end = np.minimum(woff[:, 1:], toff[None, :, 1])
        overlaps = (inter_end - inter_start) > 0

        for (_, target), row in zip(kept, overlaps):
            for token_idx in np.flatnonzero(row):
                labels[token_idx] = target

    return labels

In [47]:
df['labels'] = df.apply(lambda x: get_labels(x,KEEP_DIC),axis=1)

In [48]:
df.tail()

Unnamed: 0,masked_text,unmasked_text,privacy_mask,span_labels,bio_labels,tokenised_text,label,label_text,label_att,tokens,offset_mapping,labels
209256,"La nostra università, situata in [STATE_1], si...","La nostra università, situata in Marche, si pr...","{'[STATE_1]': 'Marche', '[JOBTITLE_1]': 'Legac...","[[0, 33, 'O'], [33, 39, 'STATE_1'], [39, 120, ...","[O, O, O, O, O, O, O, O, O, O, O, B-STATE, O, ...","{'tokens': ['La', 'nostra', 'università', ',',...","([NAME_STUDENT], [Reese])",[NAME_STUDENT],[Reese],"[La, nostra, università, ,, situata, in, March...","[(0, 2), (3, 9), (10, 20), (20, 21), (22, 29),...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
209257,Allegato è lo scontrino per il programma educa...,Allegato è lo scontrino per il programma educa...,"{'[CURRENCYNAME_1]': 'New Israeli Sheqel', '[C...","[[0, 110, 'O'], [110, 128, 'CURRENCYNAME_1'], ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","{'tokens': ['Allegato', 'è', 'lo', 'scontrino'...","([], [])",[],[],"[Allegato, è, lo, scontrino, per, il, programm...","[(0, 8), (9, 10), (11, 13), (14, 23), (24, 27)...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
209258,Non dimenticare di segnalare i progressi di ad...,Non dimenticare di segnalare i progressi di ad...,"{'[ZIPCODE_1]': '09318-1647', '[DOB_1]': '1915...","[[0, 81, 'O'], [81, 91, 'ZIPCODE_1'], [91, 101...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","{'tokens': ['Non', 'dimenticare', 'di', 'segna...","([NAME_STUDENT], [Lourdes])",[NAME_STUDENT],[Lourdes],"[Non, dimenticare, di, segnalare, i, progressi...","[(0, 3), (4, 15), (16, 18), (19, 28), (29, 30)...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
209259,"[GENDER_1], abbiamo elaborato la tua richiesta...","Male to female transgender woman, abbiamo elab...",{'[GENDER_1]': 'Male to female transgender wom...,"[[0, 32, 'GENDER_1'], [32, 140, 'O'], [140, 15...","[B-GENDER, I-GENDER, I-GENDER, I-GENDER, I-GEN...","{'tokens': ['Male', 'to', 'female', 'transgend...","([], [])",[],[],"[Male, to, female, transgender, woman, ,, abbi...","[(0, 4), (5, 7), (8, 14), (15, 26), (27, 32), ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
209260,"Infine, come persona interessata all'educazion...","Infine, come persona interessata all'educazion...","{'[URL_1]': 'https://bruised-snob.name', '[USE...","[[0, 105, 'O'], [105, 130, 'URL_1'], [130, 198...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","{'tokens': ['Infine', ',', 'come', 'persona', ...","([ID_NUM], [1598:bede:1f5a:9ffe:641f:c74f:5d77...",[ID_NUM],[1598:bede:1f5a:9ffe:641f:c74f:5d77:fede],"[Infine, ,, come, persona, interessata, all'ed...","[(0, 6), (6, 7), (8, 12), (13, 20), (21, 32), ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [49]:
import spacy
from spacy import displacy
from pylab import cm, matplotlib
import os

# Highlight colour (CSS hex string) for each target label shown by displaCy.
colors = {
            'NAME_STUDENT': '#8000ff',
            'EMAIL': '#2b7ff6',
            'USERNAME': '#2adddd',
            'ID_NUM': '#80ffb4',
            'PHONE_NUM': '#d4dd80',  # leading '#' was missing, making the colour invalid
            'URL_PERSONAL': '#ff8042',
            'STREET_ADDRESS': '#ff0000'
         }


def visualize(full_text,offset_mapping,labels):
    """Render labelled character spans of `full_text` with displaCy.

    `offset_mapping` and `labels` are parallel sequences: each offset is a
    (start, end) pair and each label is a string, of which only the part
    after the last '-' is used as the entity name. Colours come from the
    module-level `colors` dict.
    """
    ents = [
        {
            'start': int(span[0]),
            'end': int(span[1]),
            'label': str(tag.split('-')[-1]),
        }
        for span, tag in zip(offset_mapping, labels)
    ]

    # displaCy's "manual" input format: raw text plus pre-computed entities.
    doc2 = {"text": full_text, "ents": ents}
    options = {"ents": list(colors), "colors": colors}

    displacy.render(doc2, style="ent", options=options, manual=True, jupyter=True)

In [147]:
# Pick a random row whose `label_text` is not the empty list (compared via
# its string form) and pull out the fields needed for visualisation.
# NOTE(review): `idx` is an index label but is used with `.iloc`; this relies
# on the frame having a default 0..n-1 index — confirm.
idx = random.choice(df[df.label_text.astype(str)!="[]"].index)

full_text = df.iloc[idx]['unmasked_text']
tokens = df.iloc[idx]['tokens']
labels = df.iloc[idx]['labels']
idx

89102

In [148]:
df.iloc[idx]['privacy_mask']

"{'[USERNAME_1]': 'April_Cassin66', '[IPV6_1]': '0d5c:a4bb:bffd:efcf:ecea:30cf:f3ae:fbcf', '[JOBAREA_1]': 'Intranet', '[PASSWORD_1]': 'TtS6t4YIPBbi'}"

In [149]:
df.iloc[idx]['span_labels']

"[[0, 43, 'O'], [43, 57, 'USERNAME_1'], [57, 65, 'O'], [65, 104, 'IPV6_1'], [104, 224, 'O'], [224, 232, 'JOBAREA_1'], [232, 282, 'O'], [282, 294, 'PASSWORD_1'], [294, 295, 'O']]"

In [150]:
# df.iloc[idx]['labels']

In [151]:
# Recompute offsets with the project helper get_offset_mapping (presumably one
# (start, end) pair per token, aligned with `labels` — TODO confirm), then
# keep only the positions whose label is not "O" so that only labelled tokens
# are highlighted.
offset_mapping = get_offset_mapping(full_text, tokens)
offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels) if y!="O"]
labels_ = [x for x in labels if x!="O"]

In [152]:
visualize(full_text,offset_mapping_,labels_)

In [104]:
df.head()

Unnamed: 0,masked_text,unmasked_text,privacy_mask,span_labels,bio_labels,tokenised_text,label,label_text,label_att,tokens,offset_mapping,labels
0,A student's assessment was found on device bea...,A student's assessment was found on device bea...,"{'[PHONEIMEI_1]': '06-184755-866851-3', '[JOBA...","[[0, 57, 'O'], [57, 75, 'PHONEIMEI_1'], [75, 1...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-PHON...","{'tokens': ['A', 'student', ''s', 'assessment'...","([PHONE_NUM], [06-184755-866851-3])",[PHONE_NUM],[06-184755-866851-3],"[A, student, 's, assessment, was, found, on, d...","[(0, 1), (2, 9), (9, 11), (12, 22), (23, 26), ...","[O, O, O, O, O, O, O, O, O, O, O, PHONE_NUM, P..."
1,"Dear [FIRSTNAME_1], as per our records, your l...","Dear Omer, as per our records, your license 78...","{'[FIRSTNAME_1]': 'Omer', '[VEHICLEVIN_1]': '7...","[[0, 5, 'O'], [5, 9, 'FIRSTNAME_1'], [9, 44, '...","[O, B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O...","{'tokens': ['Dear', 'Omer', ',', 'as', 'per', ...","([NAME_STUDENT, ID_NUM], [Omer, 78B5R2MVFAHJ48...","[NAME_STUDENT, ID_NUM]","[Omer, 78B5R2MVFAHJ48500]","[Dear, Omer, ,, as, per, our, records, ,, your...","[(0, 4), (5, 9), (9, 10), (11, 13), (14, 17), ...","[O, NAME_STUDENT, O, O, O, O, O, O, O, O, ID_N..."
2,[FIRSTNAME_1] could you please share your reco...,Kattie could you please share your recomndatio...,"{'[FIRSTNAME_1]': 'Kattie', '[AGE_1]': '72', '...","[[0, 6, 'FIRSTNAME_1'], [6, 75, 'O'], [75, 77,...","[B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O, O...","{'tokens': ['Kattie', 'could', 'you', 'please'...","([NAME_STUDENT], [Kattie])",[NAME_STUDENT],[Kattie],"[Kattie, could, you, please, share, your, reco...","[(0, 6), (7, 12), (13, 16), (17, 23), (24, 29)...","[NAME_STUDENT, O, O, O, O, O, O, O, O, O, O, O..."
3,Emergency supplies in [BUILDINGNUMBER_1] need ...,Emergency supplies in 16356 need a refill. Use...,"{'[BUILDINGNUMBER_1]': '16356', '[MASKEDNUMBER...","[[0, 22, 'O'], [22, 27, 'BUILDINGNUMBER_1'], [...","[O, O, O, B-BUILDINGNUMBER, I-BUILDINGNUMBER, ...","{'tokens': ['Emergency', 'supplies', 'in', '16...","([ID_NUM], [5890724654311332])",[ID_NUM],[5890724654311332],"[Emergency, supplies, in, 16356, need, a, refi...","[(0, 9), (10, 18), (19, 21), (22, 27), (28, 32...","[O, O, O, O, O, O, O, O, O, ID_NUM, O, O, O, O..."
4,"The [AGE_1] old child at [BUILDINGNUMBER_1], h...","The 88 old child at 5862, has showcased an unu...","{'[AGE_1]': '88', '[BUILDINGNUMBER_1]': '5862'...","[[0, 4, 'O'], [4, 6, 'AGE_1'], [6, 20, 'O'], [...","[O, B-AGE, O, O, O, B-BUILDINGNUMBER, I-BUILDI...","{'tokens': ['The', '88', 'old', 'child', 'at',...","([ID_NUM], [Y2rWliOhf8Ir])",[ID_NUM],[Y2rWliOhf8Ir],"[The, 88, old, child, at, 5862, ,, has, showca...","[(0, 3), (4, 6), (7, 10), (11, 16), (17, 19), ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [29]:
# Keep only rows whose `label_text` is not the empty list (compared via its
# string form), i.e. rows where modify_dictionary kept at least one entity.
# Rebinding `df` makes this cell non-idempotent with respect to the raw frame.
df = df[df.label_text.astype(str)!="[]"].reset_index(drop=True)
df.shape

(165750, 12)

In [60]:
# df["offset_mapping"] = df["offset_mapping"].transform(lambda x:eval(x))
# df["labels"] = df["labels"].transform(lambda x:eval(x))
# df["tokens"] = df["tokens"].transform(lambda x:eval(x))

In [36]:
df['document'] = np.arange(len(df))

In [37]:
df['full_text'] = df['unmasked_text']

In [38]:
# Export the training columns. NOTE(review): `tokens`, `labels` and
# `offset_mapping` hold Python lists, so CSV stores them as their string
# repr and they must be parsed again after reading the file back.
df[['document',"full_text","tokens","labels",'offset_mapping']].to_csv(data_path/'pii-masking-200k.csv',index=False)

In [11]:
import re

# Demo: locate the character span of a pattern inside a text.
txt = "This is a sample text with some pattern."
s = "pattern"

# (start, end) of every match, in order of appearance.
res = [m.span() for m in re.finditer(s, txt)]

# Report the first match, if any.
if not res:
    print("No match found.")
else:
    st, ed = res[0]
    print(f"Match found from index {st} to {ed}: '{txt[st:ed]}'")


Match found from index 32 to 39: 'pattern'
