In [55]:
import pandas as pd
import ast
import re

from sklearn.metrics import precision_recall_fscore_support, f1_score
from sklearn.preprocessing import MultiLabelBinarizer

import plotly.graph_objects as go

The dessi dataset can be downloaded from   
https://www.kaggle.com/datasets/sensitivedetection/dessi-dataset-for-structured-sensitive-information  
where only the `DeSSI_v2` folder is required.

# Evaluation - Presidio Confusion Matrix on Dessi Data

Use a Presidio confusion matrix to find label errors in the DeSSI data.  
Use all Presidio predictions without any thresholds so that no wrong prediction is ignored.

In [56]:
# Load the three DeSSI splits (column data and the matching label files).
dessi_train_labels = pd.read_csv("DeSSI_v2/train_labels.csv")
dessi_train = pd.read_csv("DeSSI_v2/train.csv")

dessi_dev_labels = pd.read_csv("DeSSI_v2/dev_labels.csv")
dessi_dev = pd.read_csv("DeSSI_v2/dev.csv")

dessi_test_labels = pd.read_csv("DeSSI_v2/test_labels.csv")
dessi_test = pd.read_csv("DeSSI_v2/test.csv")

# One split name per data column, in the same order the splits are concatenated.
dataset_split = []
for split_name, split_frame in (
    ("train", dessi_train),
    ("validation", dessi_dev),
    ("test", dessi_test),
):
    dataset_split.extend([split_name] * split_frame.shape[1])

# Data: splits side by side (one DeSSI column per frame column),
# with the split name appended as an extra last row.
dessi_all = pd.concat([dessi_train, dessi_dev, dessi_test], axis=1)
dataset_split_df = pd.DataFrame(dataset_split).T
dataset_split_df.columns = dessi_all.columns
dessi_all = pd.concat([dessi_all, dataset_split_df]).reset_index(drop=True)

# Labels: splits stacked row-wise, with the split name as an extra "split_type" column.
dessi_all_labels = pd.concat(
    [dessi_train_labels, dessi_dev_labels, dessi_test_labels], axis=0
).reset_index(drop=True)
dataset_split_df = pd.DataFrame(dataset_split)
dataset_split_df.columns = ["split_type"]
dessi_all_labels = pd.concat(
    [dessi_all_labels, dataset_split_df], axis=1
).reset_index(drop=True)


Columns (62,107,161,241,255,324,449,503,686,720,807,812,889,934,1073,1113,1133,1154,1158,1242,1317,1333,1394,1408,1470,1553,1577,1674,1729,1762,1872,1913,1940,2045,2131,2139,2209,2288,2404,2436,2601,2627,2663,2779,2841,2960,2994,3061,3063,3080,3129,3136,3250,3319,3439,3464,3499,3520,3596,3637,3649,3748,3778,3787,3808,3902,3907,3917,3951,4009,4034,4037,4069,4129,4210,4218,4252,4325,4359,4386,4479,4646,4649,4652,4710,4936,4939,5013,5065,5069,5348,5352,5356,5399,5476,5702,5736,5889,5962,6072,6094,6107,6140,6159,6337,6360,6383,6407,6415,6423,6507,6555,6567,6590,6593,6689,6809,6815,6920,6995,7042,7127,7163,7229,7295,7301,7319,7321,7354,7399,7403,7465,7474,7593,7594,7694,7719,7798,7872,7979,8179,8298,8308,8473,8542,8565,8568,8570,8579,8633,8662,8707,8835,8846,8859,8899,8933,9024,9084,9126,9170,9184,9198,9251,9329,9412,9505,9588,9592,9610,9652,9803,9818,9907,9908,9952,9975,10073,10099,10136,10161,10359,10369,10407,10427,10515,10589,10694,10748,10778,10787,10830,10839,10898,11026,11068,11201,

In [57]:
def read_files(method):
    """Load the stored Presidio prediction CSVs of one method for all splits.

    Args:
        method: Name of the prediction method; it selects the
            ``results_<method>.csv`` file inside each split folder.

    Returns:
        Tuple ``(train, val, test)`` with whatever ``pd.read_csv`` returns
        for the train, val and test result files, in that order.
    """
    base_dir = "../../Presidio/old_predictions/dessi_results"
    train, val, test = (
        pd.read_csv(f"{base_dir}/{split}/results_{method}.csv")
        for split in ("train", "val", "test")
    )
    return train, val, test

def concat_results(train, val, test):
    """Join the per-split result frames column-wise and append the split name.

    The split widths are taken from the frames passed in rather than from
    module-level DataFrames, so the function depends only on its arguments.

    Args:
        train: Result frame of the train split.
        val: Result frame of the validation split.
        test: Result frame of the test split.

    Returns:
        DataFrame holding all columns of ``train``, ``val`` and ``test`` side
        by side, plus one extra last row containing "train", "validation" or
        "test" for each column. Row index and column labels are both reset to
        0..n-1.
    """
    all_results = pd.concat([train, val, test], axis=1)
    dataset_split = (
        ["train"] * train.shape[1]
        + ["validation"] * val.shape[1]
        + ["test"] * test.shape[1]
    )
    dataset_split_df = pd.DataFrame(dataset_split).T
    dataset_split_df.columns = all_results.columns
    combined = pd.concat([all_results, dataset_split_df]).reset_index(drop=True)
    # Transpose, reset, transpose back: replaces the column names with 0..n-1.
    return combined.T.reset_index(drop=True).T

results_method3_train, results_method3_val, results_method3_test = read_files("columnwise")

results_method_3_all = concat_results(results_method3_train, results_method3_val, results_method3_test)

The Classes of Presidio must be mapped to the classes of dessi to compare both

In [58]:
# Presidio entity name -> DeSSI class (a list when one entity covers several classes).
# Entities without an entry (IN_VEHICLE_REGISTRATION, IP_ADDRESS, MEDICAL_LICENSE,
# URL, US_BANK_NUMBER) are matched to "Other_data" by the lookup default.
CATEGORY_MAP = {
    # National identification numbers
    "AU_TFN": "NIN",
    "AU_MEDICARE": "NIN",
    "IN_AADHAAR": "NIN",
    "IN_PAN": "NIN",
    "shared IN_PAN": "NIN",
    "UK_NHS": "NIN",
    "US_ITIN": "NIN",
    "US_SSN": "NIN",
    # Passports
    "IN_PASSPORT": "Passport",
    "US_PASSPORT": "Passport",
    # ID cards
    "US_DRIVER_LICENSE": "ID_Card",
    "shared US_DRIVER_LICENSE": "ID_Card",
    # Organizations
    "AU_ABN": "Organization",
    "AU_ACN": "Organization",
    # Financial
    "CREDIT_CARD": "CCN",
    "IBAN_CODE": "IBAN",
    # Contact details
    "EMAIL_ADDRESS": "Email",
    "PHONE_NUMBER": "Phone_number",
    "shared PHONE_NUMBER": "Phone_number",
    # People and dates
    "PERSON": "Person",
    "shared PERSON": "Person",
    "DATE_TIME": "Date",
    "shared DATE_TIME": "Date",
    # Entities that map to several DeSSI classes at once
    "LOCATION": ["Geolocation", "Address", "GPE"],
    "shared LOCATION": ["Geolocation", "Address", "GPE"],
    "NRP": ["Nationality", "Religion"],
    "shared NRP": ["Nationality", "Religion"],
}

In [59]:
def get_categories(strings):
    """Translate Presidio entity names into DeSSI class names.

    Args:
        strings: List of Presidio entity names.

    Returns:
        List of DeSSI class names in input order. An entity mapped to several
        classes contributes all of them; an entity without a mapping
        contributes "Other_data". An empty input yields ``["Other_data"]``.
    """
    if strings == []:
        return ["Other_data"]
    mapped = []
    for entity_name in strings:
        target = CATEGORY_MAP.get(entity_name, "Other_data")
        # A list target is flattened into the result, a single class is appended.
        mapped += target if isinstance(target, list) else [target]
    return mapped

def convert_labels(label_str):
    """Parse a label string such as ``"{'A', 'B'}"`` or ``"A, B"`` into a list.

    Curly braces and single quotes are removed, the rest is split on commas
    and every piece is stripped of surrounding whitespace.
    """
    cleaned = label_str
    for unwanted in "{}'":
        cleaned = cleaned.replace(unwanted, "")
    labels = []
    for piece in cleaned.split(","):
        labels.append(piece.strip())
    return labels

def build_dataframe(results_df, threshold_score, threshold_count):
    """Aggregate the per-value predictions of every column into one predicted label set.

    Works on a copy of ``results_df`` and appends three rows to it
    (``height`` is the original number of rows):

    * row ``height``: the distinct prediction items of the column whose
      occurrence count is strictly greater than ``threshold_count``
    * row ``height+1``: the occurrence counts of those items
    * row ``height+2``: the DeSSI classes (via ``get_categories``) of the
      items whose score is at least ``threshold_score``, as a list of labels

    Row 0 is additionally converted from its string form into a list of
    labels with ``convert_labels``.

    Args:
        results_df: Frame with one prediction column per frame column. Rows
            2, 4, ... (below ``height``) must contain strings that
            ``ast.literal_eval`` turns into an iterable of prediction items.
            Every item must contain a decimal number (read as the score); a
            trailing ``_<number>`` is stripped to obtain the entity name.
            Presumably row 0 holds the ground-truth label string and the last
            row the split name - confirm against the result files.
        threshold_score: Minimum score (inclusive) for an item to be kept.
        threshold_count: An item is kept only if it occurs more often than
            this value.

    Returns:
        The extended copy of ``results_df``.

    Raises:
        AttributeError: If a kept item contains no decimal number
            (``re.search`` returns ``None``).
    """
    height, width = results_df.shape
    results_copy = results_df.copy()
    print(f"Threshold score: {threshold_score}\nThreshold count: {threshold_count}")
    # Three new rows; every cell starts as its own empty list so it can be filled in place.
    empty_row1 = [[] for _ in range(width)]
    empty_row2 = [[] for _ in range(width)]
    empty_row3 = [[] for _ in range(width)]
    results_copy.loc[height] = empty_row1.copy()
    results_copy.loc[height+1] = empty_row2.copy()
    results_copy.loc[height+2] = empty_row3.copy()
    for i in range(width):
        # Flatten the prediction lists of every second row (2, 4, ...) and count each item.
        series = pd.Series([item for sublist in results_copy.iloc[2:height:2, i]
                        for item in ast.literal_eval(sublist)], dtype="object").value_counts()
        for a, b in series.items():
            if b > threshold_count:
                # Mutates the list objects stored in the cells of the two new rows.
                results_copy.iloc[height, i].append(a)
                results_copy.iloc[height+1, i].append(b)
    for i in range(width):
        # Keep items whose first decimal number (the score) reaches the threshold,
        # then strip the trailing "_<number>" to get the bare entity name.
        entities = [re.sub(r'_\d+(\.\d+)?$', '', item)
                    for item in results_copy.iloc[height, i] if float(re.search(r'(\d+\.\d+)', item).group(1)) >= threshold_score]
        mapped_entities = get_categories(entities)
        # Deduplicate via a set; stored as its string form and parsed back below.
        results_copy.iloc[height+2, i] = str(set(mapped_entities))

    # Turn the label strings of row 0 and of the prediction row into lists of labels.
    results_copy.loc[0] = results_copy.loc[0].apply(convert_labels)
    results_copy.loc[height+2] = results_copy.loc[height+2].apply(convert_labels)
    return results_copy

def compute_performance(results, confusion=False):
    """Compute per-class precision/recall/F1 and optionally plot a confusion heatmap.

    Row 0 of ``results`` is used as the true label lists and the last row
    (label ``results.shape[0] - 1``) as the predicted label lists. The classes
    are learned from the true labels only, so predicted labels that never
    occur as a true label are dropped by ``MultiLabelBinarizer.transform``.

    Args:
        results: Frame as produced by ``build_dataframe``.
        confusion: If truthy, also show a heatmap whose cell (i, j) is the
            share of columns with true class i for which class j was
            predicted.

    Returns:
        DataFrame with the columns ``Class``, ``P``, ``R``, ``F1`` and ``S``
        (support), sorted by support in descending order. Micro, macro and
        weighted F1 are printed.
    """
    mlb = MultiLabelBinarizer()
    y_true = mlb.fit_transform(results.loc[0])
    y_pred = mlb.transform(results.loc[results.shape[0] - 1])
    # zero_division=1: undefined scores (e.g. a class that is never predicted) are reported as 1.
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, average=None, zero_division=1)
    f1_micro = f1_score(y_true, y_pred, average='micro')
    f1_macro = f1_score(y_true, y_pred, average='macro')
    f1_weighted = f1_score(y_true, y_pred, average='weighted')
    evaluation = pd.DataFrame({
        'Class': mlb.classes_,
        'P': precision,
        'R': recall,
        'F1': f1,
        'S': support
    }).sort_values("S", ascending=False).reset_index(drop=True)
    print(f'Micro F1 Score: {f1_micro:.4f}')
    print(f'Macro F1 Score: {f1_macro:.4f}')
    print(f'Weighted F1 Score: {f1_weighted:.4f}')

    if confusion:
        # Size the matrix from the fitted classes instead of a hard-coded 20.
        n_classes = len(mlb.classes_)
        confusion_matrix = []
        for i in range(n_classes):
            has_true_label = y_true[:, i] == 1
            if has_true_label.any():
                # Share of the samples with true class i that got each class predicted.
                row = y_pred[has_true_label].mean(axis=0).tolist()
            else:
                row = [0] * n_classes
            confusion_matrix.append(row)
        fig = go.Figure(go.Heatmap(z=confusion_matrix, x=mlb.classes_, y=mlb.classes_, colorscale='Blues'))
        fig.update_layout(title_text='Confusion Matrix', height=800, width=800)
        fig.update_xaxes(title="Predicted Label")
        fig.update_yaxes(title="True Label")
        fig.show()
    return evaluation

# Analyze Presidio Confusion Matrix Part 2

Get a confusion matrix without thresholds to compare all Presidio predictions with the real labels  
Try to find label errors in Dessi  
Use the columnwise approach because it is similar to the CASSED approach; without thresholds all three approaches perform similarly, so the choice of approach does not matter here.

In [60]:
# Both thresholds are 0, so every Presidio prediction is kept for the confusion matrix.
results_method3_thresholds = build_dataframe(
    results_method_3_all, threshold_score=0, threshold_count=0
)
evaluation_method3 = compute_performance(results_method3_thresholds, confusion=True)

print(evaluation_method3)

Threshold score: 0
Threshold count: 0
Micro F1 Score: 0.2469
Macro F1 Score: 0.2354
Weighted F1 Score: 0.3321


           Class         P         R        F1     S
0     Other_data  0.137213  0.242951  0.175377  6561
1   Phone_number  0.377567  0.990493  0.546727  4418
2        Address  0.169818  0.843240  0.282704  3802
3         Person  0.204139  0.999721  0.339046  3582
4            NIN  0.219195  0.651773  0.328062  3469
5          Email  1.000000  0.995926  0.997959  3191
6           Date  0.153270  0.985622  0.265286  2782
7            GPE  0.104825  0.911142  0.188020  2172
8   Organization  0.001077  0.001445  0.001234  2076
9    Geolocation  0.035701  0.327502  0.064384  2058
10     SWIFT/BIC  1.000000  0.000000  0.000000   240
11          IBAN  0.892019  0.892019  0.892019   213
12           CCN  0.345098  0.807339  0.483516   109
13      Passport  0.046512  0.388889  0.083086   108
14        Gender  1.000000  0.000000  0.000000    94
15      Religion  0.012819  1.000000  0.025313    93
16     Sexuality  1.000000  0.000000  0.000000    92
17   Nationality  0.012543  1.000000  0.024775

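Presidio has no recognizers for the classes Gender, Race, SWIFT/BIC and Sexuality, and Organization is only reachable through the Australian business numbers (`AU_ABN`, `AU_ACN`). As a result, recall and F1 are zero or close to zero for these classes. Precision is reported as 1.0 for classes that are never predicted because of `zero_division=True`.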

Analyse the mistakes made by Presidio.  
First focus on the false negatives of every class, i.e. the columns for which Presidio did not predict the ground-truth class.

In [61]:
# Previously cleaned labels and the positions of the columns already marked as wrongly labelled.
dessi_labels_cleaned = pd.read_csv("dessi_cleaned/all_labels_cleaned.csv")
wrong_columns = pd.read_csv("dessi_cleaned/wrong_columns.csv")["0"].tolist()

In [62]:
def analyse_class(class_label, focus="false negatives"):
    """Collect the DeSSI columns of one class that Presidio missed or found.

    Reads the module-level frames ``results_method3_thresholds``,
    ``dessi_all_labels`` and ``dessi_all``.

    Args:
        class_label: DeSSI class name. Columns are selected when their
            ``label`` entry contains this text (``Series.str.contains``, so
            it is a substring / regular-expression match).
        focus: "false negatives" (default) keeps the columns whose predicted
            labels do not include ``class_label``; "true positives" keeps
            those whose predicted labels include it.

    Returns:
        DataFrame with one column per selected DeSSI column: row 0 is the
        ground-truth label list, row 1 the predicted label list, and the
        remaining rows are the rows of ``dessi_all`` for that column.

    Raises:
        ValueError: If ``focus`` is neither "false negatives" nor
            "true positives".
    """
    if focus not in ("false negatives", "true positives"):
        raise ValueError("Focus should be either 'false negatives' or 'true positives'")
    keep_hits = focus == "true positives"

    has_class = dessi_all_labels["label"].str.contains(class_label)
    column_positions = dessi_all_labels.loc[has_class].index
    reduced_columns = results_method3_thresholds.iloc[:, column_positions].T

    # The predictions are the last row of the results frame (the last column
    # after the transpose); addressed by position instead of the magic number 204.
    predicted_labels = reduced_columns.iloc[:, -1]
    indi = [
        i for i, a in zip(reduced_columns.index, predicted_labels)
        if (class_label in a) == keep_hits
    ]

    selected = reduced_columns.T[indi]
    d = dessi_all.iloc[:, [int(a) for a in selected.columns.values]]
    # Ground truth (first row) and predictions (last row), named like the data columns.
    add_info = selected.iloc[[0, -1], :].copy()
    add_info.columns = d.columns
    return pd.concat([add_info, d]).reset_index(drop=True)

By default, `analyse_class` returns the false negatives.  
The first row of the returned dataframe displays the ground-truth label and the second row the predictions; the remaining rows are the column values.

In [63]:
# Geolocation columns for which Presidio did not predict "Geolocation".
df_geo = analyse_class("Geolocation", focus="false negatives")
df_geo.head()

Unnamed: 0,switching_center_lat,lat,lng,geo_latitude,lat.1,call_location_lat,latitude,geo_longitude,call_location_lng,mvgdxwhvwzsg,...,latitude.16,long.19,long.20,geo_lat.17,hzrgwloswcvs,lat.56,call_location_lng.8,geo_lon.17,zvneublxkuwy,latdeg.12
0,[Geolocation],[Geolocation],[Geolocation],[Geolocation],[Geolocation],[Geolocation],[Geolocation],[Geolocation],[Geolocation],[Geolocation],...,[Geolocation],[Geolocation],[Geolocation],[Geolocation],[Geolocation],[Geolocation],[Geolocation],[Geolocation],[Geolocation],[Geolocation]
1,"[Phone_number, ID_Card]","[Date, Phone_number, ID_Card]","[Date, Phone_number, ID_Card, Person]","[Date, Phone_number, ID_Card]","[Phone_number, ID_Card]","[Date, Phone_number, ID_Card]","[Phone_number, ID_Card]","[Date, Phone_number, ID_Card]","[Date, Phone_number, ID_Card]","[Phone_number, ID_Card, Person]",...,"[Phone_number, ID_Card, Person]","[Phone_number, ID_Card]","[Date, Phone_number, ID_Card]","[Phone_number, ID_Card]","[Date, Phone_number, ID_Card]","[Date, Phone_number, ID_Card]","[Date, Phone_number, ID_Card]","[Phone_number, ID_Card]","[Date, Phone_number, ID_Card]","[Phone_number, ID_Card]"
2,27.996344,28.504554,"39.3520555, -16.36384","39.3348093,-74.5492412",-15.573179,27.917121,38.963465,"28.1069604, -15.4784977",-74.572777,-74.789275,...,-15.46146,-15.381301,"28.3902183,-74.6056514",39.5237827; -16.6775098,-16.323926,39.499909,-16.299797,-14.103708,-16.76674,28.0778679; -74.6895013
3,-74.61295,28.439854,"28.9255692, -74.5684209","39.3829683,-15.4114031",-16.31804,-75.027028,39.390552,"39.0730144, -75.2945396",-75.248239,39.334576,...,28.460881,28.116343,"28.4358689,-16.2554518",39.1538267; -16.5459905,28.278912,-16.378117,-17.884363,-15.509731,39.441366,39.1035643; -17.780067
4,-74.766877,-74.618583,"39.1601016, -16.4731214","28.1896586,-75.0695624",28.653639,28.127593,-74.929024,"39.3265654, -74.6814196",39.447195,39.550524,...,-16.416828,-74.729989,"39.4455182,-74.5375321",28.1520454; -74.6308732,-15.507524,-74.549241,39.447195,-74.438212,-74.609143,39.2429523; -74.7195401


In [64]:
# Geolocation columns for which Presidio did predict "Geolocation".
df_geo = analyse_class(class_label="Geolocation", focus="true positives")
df_geo.head()

Unnamed: 0,lon,jqfyjomipilf,country,address_line_2,region,lat.2,ynhvrpjtcrgr,address,country.2,qrzebytxdplq,...,fyzdqbiaoyos,hxupupwzgpyg,feapxnqtchxx,lng.19,tzcmejaucdkk,geo_lat.16,country.32,address.7,dadtysfqjbcs,geo_latitude.18
0,[Geolocation],"[Address, Geolocation]","[Address, Geolocation]","[Address, Geolocation]","[Address, Geolocation]","[Geolocation, Address]",[Geolocation],"[Address, Geolocation]","[Address, Geolocation]",[Geolocation],...,"[Geolocation, Address]",[Geolocation],"[Geolocation, Address]",[Geolocation],"[Geolocation, Address]",[Geolocation],"[Address, Geolocation]","[Address, Geolocation]","[Address, Geolocation]","[Geolocation, Address]"
1,"[GPE, Geolocation, Address, Person, Phone_numb...","[GPE, Geolocation, Address, Person, Phone_numb...","[GPE, Geolocation, Address, Person, Phone_numb...","[GPE, Geolocation, Address, Person, Phone_numb...","[GPE, Geolocation, Address, Person, Phone_numb...","[GPE, Nationality, Geolocation, Address, Perso...","[GPE, Geolocation, Address, Person, Phone_numb...","[GPE, Nationality, Geolocation, Address, Perso...","[GPE, Nationality, Geolocation, Address, Perso...","[GPE, Geolocation, Address, Phone_number, Date...",...,"[GPE, Nationality, Geolocation, Address, Perso...","[GPE, Geolocation, Address, Person, Phone_numb...","[GPE, Geolocation, Address, Person, Phone_numb...","[GPE, Geolocation, Address, Phone_number, Date...","[GPE, Geolocation, Address, Person, Phone_numb...","[GPE, Geolocation, Address, Phone_number, ID_C...","[GPE, Geolocation, Address, Person, Phone_numb...","[GPE, Nationality, Geolocation, Address, Perso...","[GPE, Nationality, Geolocation, Address, Perso...","[GPE, Geolocation, Address, Person, Phone_numb..."
2,39.4387199;-74.5492412,-74.8062027,27.7753042,28.3718565,-74.5623918,"Via Monte Grappa, Limbiate, Italia",39.3424993;-16.3189551,-15.6565639,-15.6718676,"28.3379563, -17.9135216",...,"Crossroad Road rd. on number 186 in Waterbury,...","28.4801805, -74.5792423",31 Metropolitan Road Dyerville Rhode Island 02908,"28.3359601,-74.8268432","Michael's Way, Montville, United States of Ame...",28.1164633; -74.5108955,-74.6086658,39.4896717,28.4060052,"Strada Frena, Marèo - Enneberg - Marebbe, Tren..."
3,39.4536535;-74.8164334,"Grand Avenue rd. on number 84 in Manchester, N...","Marsvägen, Kalmar, Sverige","Nicoll Street, Washingtonville, United States ...","Jerzego Malchera rd. on number 16C in Rybnik, ...",38.9801188,39.0681271;-15.6741892,"Sunnanvinden, Kristinehamn, Sverige","7 Goldthwaite Road, Worcester, Massachusetts 0...","38.9979279, -75.2291552",...,38.9775039,"39.372182, -74.9848918",-16.3308235,"28.5534111,-75.3949294",39.1566903,28.4170218; -74.9602198,"878 Huntingdon Drive, Niskayuna, New York 12309",38.962682,"Christie Road, Stratford, New Hampshire",27.84977
4,28.3839587;-17.2639769,39.2429523,Warren Avenue rd. on number 235 in Ingrams Cor...,"England Road, Hampton, United States of America","Źródlana 11, Zielona Góra 65-129, województwo ...","Main Street, Catskill, New York",39.333139;-16.7626164,"6 Burwell Street, Saylesville, Rhode Island 02...","Via del Pettirosso, San Giovanni in Marignano,...","27.8471096, -17.7728009",...,"Viale dell'Appennino, Forlì, Italia","39.1538267, -74.8838924",28.143646,"45.117899,-74.532659",-15.4614497,39.11998; -74.5954738,"Young Road, Orwell, United States of America",-75.2103415,28.1273995,28.1044789


Presidio cannot reliably detect latitude/longitude values on their own; it mostly detects them when they occur together with an address.

In [65]:
# IBAN columns for which Presidio did not predict "IBAN".
df_iban = analyse_class("IBAN", focus="false negatives")
df_iban.head()

Unnamed: 0,iban number,mzfmlletrdxv,customer acc number.1,inoacc.1,inoacc.3,international bank account number.1,bank account id.2,account iban.2,hudczjnhmmbq,nalogodavac iban.4,...,customer acc number.3,bank code.2,intf iban.2,id acc,zfyvfwsupxeq,iban.1,country code,iban number.1,mppgaklccrne,iban check digits
0,[IBAN],[IBAN],[IBAN],[IBAN],[IBAN],[IBAN],[IBAN],[IBAN],[IBAN],[IBAN],...,[IBAN],[IBAN],[IBAN],[IBAN],[IBAN],[IBAN],[IBAN],[IBAN],[IBAN],[IBAN]
1,"[GPE, Geolocation, Other_data, Address, Person]","[GPE, Nationality, Geolocation, Other_data, Ad...","[GPE, Geolocation, Other_data, Address, Person]","[Other_data, Person]","[GPE, Nationality, Geolocation, Other_data, Ad...","[Other_data, Religion, Nationality, Person]","[GPE, Nationality, Geolocation, Other_data, Ad...","[GPE, Geolocation, Other_data, Address, Person...","[GPE, Nationality, Geolocation, Other_data, Ad...","[GPE, Nationality, Geolocation, Other_data, Ad...",...,"[Other_data, Religion, Nationality, Person]","[GPE, Geolocation, Other_data, Address, Person]","[GPE, Geolocation, Other_data, Address, Person]","[GPE, Nationality, Geolocation, Other_data, Ad...","[GPE, Nationality, Geolocation, Other_data, Ad...","[GPE, Nationality, Geolocation, Other_data, Ad...","[Other_data, Religion, Nationality, Person]","[GPE, Nationality, Geolocation, Other_data, Ad...","[GPE, Nationality, Geolocation, Other_data, Ad...","[GPE, Nationality, Geolocation, Other_data, Ad..."
2,BA840416015950086704,HR8602480941051131138,BA244245765067182584,HR0256373981877346240,RS35930352272499958270,HR2140162044417652268,RS35932411437346641699,RS35404420563738280187,BA419661968912540013,HR9474939221177417985,...,HR9587792497687086919,BA857025456825660826,HR6609266195637307667,BA569464057407615790,RS35989008132187447709,HR0379616540442636669,BA962968814534321653,BA417195272590809392,BA260796772999593264,HR8926892620805084675
3,BA134412479279148046,HR3922940390803749377,BA244077548359752640,HR2562675964356487682,RS35166603313977225607,HR2225242447300030633,RS35450532170765492229,RS35831490430312312174,BA708145992983975770,HR7245635886432688891,...,HR4319804145214829661,BA499003281274255476,HR7659022951318478142,BA455244868363390058,RS35031520741536477774,HR8152998813033338724,BA371621873082454698,BA797866003895190438,BA026533100298042167,HR4909876543594850926
4,BA793135358257818493,HR0083485508782717426,BA684541244572157128,HR4097109324215390551,RS35903128349560592269,HR0962723147170495143,RS35839416621047098575,RS35061982102003878817,BA015929984924931263,HR9486177831953216544,...,HR9776583681436715933,BA573240110223674133,HR7026818920215687263,BA815372306367509722,RS35921084635049545827,HR3640175745526674202,BA617503422688801670,BA503281324307804308,BA008161663787046327,HR3553857547926343664


In [66]:
def validate_iban(iban):
    """Check the structure and the mod-97 checksum of an IBAN.

    Args:
        iban: Candidate IBAN. Spaces are ignored and letters are
            case-insensitive. Non-string values are rejected.

    Returns:
        True if the value is 15-34 ASCII alphanumeric characters long, starts
        with two letters followed by two digits, and satisfies the mod-97
        check; False otherwise.
    """
    # Cell values may be NaN or numbers; those can never be an IBAN.
    if not isinstance(iban, str):
        return False

    # Remove spaces and convert to upper case
    iban = iban.replace(' ', '').upper()

    # General IBAN length bounds (the country-specific length is not checked)
    if len(iban) < 15 or len(iban) > 34:
        return False

    # Only ASCII letters and digits can be converted below; separators such
    # as '-' would otherwise produce a string that is not a number.
    if not (iban.isascii() and iban.isalnum()):
        return False

    # Two-letter country code followed by two check digits
    if not (iban[:2].isalpha() and iban[2:4].isdigit()):
        return False

    # Move the first four characters to the end
    rearranged_iban = iban[4:] + iban[:4]

    # Replace each letter with two digits (A=10, B=11, ..., Z=35)
    converted_iban = ''.join(
        char if char.isdigit() else str(ord(char) - 55)
        for char in rearranged_iban
    )

    # The IBAN is valid when the resulting number leaves remainder 1 modulo 97
    return int(converted_iban) % 97 == 1

# Validate the values of every column in df_iban.
# Rows 0 and 1 hold the label and the prediction, the last row the split name,
# so only the rows in between are checked. This needs positional slicing:
# the label-based .loc[2:-1] selects nothing on a 0..n-1 index, which meant
# that no value was ever validated. Columns are addressed by position so
# duplicate column names cannot return more than one column.
for col_pos in range(df_iban.shape[1]):
    ibans = df_iban.iloc[2:-1, col_pos].values
    for ib in ibans:
        # Skip non-string cells (NaN, numbers) instead of failing on them.
        if isinstance(ib, str) and validate_iban(ib):
            print("The IBAN checksum is valid.")
            break


In [67]:
# Find the positions in dessi_all of the columns listed in df_iban. Column names
# can repeat, so the split name in the last row of both frames must match too.
indis = []
last_row = df_iban.index[-1]  # row holding the split name of every column
for e, (i, t) in enumerate(zip(dessi_all.columns, dessi_all.iloc[-1, :])):
    if i not in df_iban.columns:
        continue
    split_entry = df_iban.loc[last_row, i]
    # A duplicated column name yields a Series of split names, a unique one a single value.
    split_values = list(split_entry) if isinstance(split_entry, pd.Series) else [split_entry]
    if t in split_values:
        indis.append(e)
for ind in indis:
    # Do not record a column twice when the cell is run again.
    if ind not in wrong_columns:
        wrong_columns.append(ind)
    dessi_labels_cleaned.loc[ind, "label"] = "Other_data"

In [68]:
# NIN columns for which Presidio did not predict "NIN".
df_nin = analyse_class("NIN", focus="false negatives")
df_nin.head()

Unnamed: 0,phone,personal_identification_number,date_employed_from,steuer_id,steuer_id.1,bcgvociropcb,date,active_at.1,gfmpxkqdcrsh,pinedgngccdq,...,swrvxibtxbkm,begujibmyytb,national_identity_card_number.4,aoithawxipeb,unxpsmjwhyip,identity_number.5,vetczbhpmiwk,nwkzyulosmgd,personal_numerical_code.12,ztyhgaddzlun
0,"[Phone_number, NIN]",[NIN],"[NIN, Date]",[NIN],"[NIN, Date]",[NIN],"[Date, NIN]","[Date, NIN]","[NIN, Person]","[NIN, Person]",...,[NIN],"[NIN, Email]",[NIN],"[NIN, Date]","[Person, NIN]",[NIN],"[NIN, Date]",[NIN],[NIN],[NIN]
1,"[GPE, Geolocation, Other_data, Address, Phone_...","[Nationality, Person, Phone_number, Date, Reli...","[Date, Phone_number, ID_Card]","[Date, Phone_number, ID_Card]","[Date, Other_data, Phone_number, ID_Card]","[GPE, Nationality, Geolocation, Address, Perso...","[Organization, Other_data, Phone_number, Date,...","[GPE, Geolocation, Address, Person, Date]","[Geolocation, GPE, Address, Person]","[GPE, Nationality, Geolocation, Address, Perso...",...,"[GPE, Nationality, Geolocation, Address, Perso...","[GPE, Nationality, Geolocation, Other_data, Ad...","[GPE, Nationality, Geolocation, Address, Perso...","[Organization, Other_data, Person, Phone_numbe...","[GPE, Organization, Geolocation, Other_data, A...","[GPE, Nationality, Geolocation, Address, Perso...","[Organization, Other_data, Phone_number, Date,...","[Organization, Other_data, Phone_number, Date,...","[Nationality, Phone_number, Date, Religion, ID...","[Date, Phone_number, ID_Card]"
2,97204538169,ZZ 97 53 18 T,ZZ 58 27 84 T,ZZ053836T,58310247966,COCLLT23A57A214A,01738654925,GODJPT12K26V595O,Ivelisse,Jamiya Mcknight Hussey,...,NLYLBL51B55K102U,KSMZRG58R16T776Y,SKBYYE80R28G171F,"August 29th, 1984",29846703519,QWSKHV04E72X618Q,01-75,36954278108,ZZ 55 65 56 T,ZZ 827141 T
3,91804536723,ZZ382558T,ZZ283276T,ZZ 25 64 92 T,68372591041,PZTLRY83H08H450J,11-1974,26/09/79,PBGETK71V00I631Y,ZZ 483355 T,...,KMPBNA36H64T060G,RIHWXV49E49L513D,QSARZV72P02C629X,86034127953,Tiffany Evan Clemmons Southard,GEOJXI07Y56P651D,92317456082,35621807499,ZZ 481385 T,ZZ 709233 T
4,06294713858,ZZ 80 48 11 T,ZZ059814T,ZZ 07 99 38 T,46238159072,HPAUAZ92L42N508E,86957014232,OIGAEW63U22Y784I,JXTIOT06E93Q850N,ZZ 555608 T,...,RREWJF63Y77B340F,CGYHHS91Y73F919D,BQUAKN64V24I412G,17592304680,04269138578,JIOEOR41Q19D538S,11-02,74530186924,ZZ 073649 T,ZZ641479T


In [69]:
# NIN columns for which Presidio did predict "NIN".
df_nin = analyse_class(class_label="NIN", focus="true positives")
df_nin.head()

Unnamed: 0,vcqcujsmuukw,kdxrpomztopy,ssn,ukofatwbaafw,uniquemastercitizennumber,uniform_civil_number,sdnasqkqptur,yshbksmmuokv,contact_email,kugzndtesrqe,...,national_insurance_number.5,order_date.5,mqrhgooelamb,pager.28,drkayupmddmt,personal_identity_number.7,personal_identity_number.8,personal_identification_number.10,personal_code.6,octslrhsuimb
0,[NIN],[NIN],[NIN],"[Date, NIN]","[NIN, Email]",[NIN],"[Phone_number, NIN]",[NIN],"[Email, NIN]","[Person, NIN]",...,"[NIN, Email]","[Date, NIN]","[NIN, Phone_number]","[Phone_number, NIN]",[NIN],[NIN],"[Email, NIN]","[NIN, Email]","[NIN, Person]",[NIN]
1,"[Date, Phone_number, NIN, Organization]","[Phone_number, Date, NIN]","[Organization, Other_data, Phone_number, Date,...","[Phone_number, Date, NIN]","[GPE, Nationality, Geolocation, Other_data, Ad...","[Other_data, Phone_number, Date, ID_Card, NIN]","[GPE, Geolocation, Other_data, Address, Passpo...","[Date, Phone_number, NIN, Organization]","[GPE, Geolocation, Other_data, Address, Person...","[GPE, Organization, Geolocation, Address, Pers...",...,"[Organization, Other_data, Email, Phone_number...","[Phone_number, Date, Person, NIN]","[GPE, Geolocation, Organization, Other_data, A...","[GPE, Geolocation, Other_data, Address, CCN, P...","[Phone_number, Date, NIN]","[Organization, Other_data, Phone_number, Date,...","[GPE, Nationality, Geolocation, Other_data, Ad...","[GPE, Geolocation, Organization, Other_data, A...","[GPE, Nationality, Geolocation, Other_data, Ad...","[Phone_number, Date, NIN]"
2,280 344 565,464-24-9836,50264871399,045-05-6709,emilietemi58@jakopec.info,96024785313,62479510384,184 380 269,SEVHCX88V88T507S,522 573 302,...,h.g@smith-green.xyz,206-28-3337,30 18 079 8035,13748569023,185-67-7497,56174298035,XKIOEC08P73W459G,emma-mehta@white-mcgrath.net,Ahmir Adriel Najera Masterson,111-72-1709
3,283 560 449,229-54-2084,68739540210,Apr 16,NWZRNL67G51G474E,83547201693,(084) 1064 7593,177 132 032,k.b@campos.com,Arellano,...,93761250481,16 Jul,+233549869575,+1 (686) 168 09 49,558-85-3611,86152973401,c-b459@gmail.com,86179025430,94387065210,477-19-4481
4,022 534 739,442-71-7382,49813706255,Mar 1,XBNNFS98F68O668D,84765913029,14724802683,142 776 608,dora-sobol@lisica.com.hr,403 813 819,...,94832105679,23 Aug,7619717029,+14-91-981-6952,068-50-7385,38905647124,sebastian-zou10@franklin.store,21370465988,Carissa Devries,091-76-3908


Can't detect specific NINs

In [70]:
# Passport columns for which Presidio did not predict "Passport".
df_pass = analyse_class("Passport", focus="false negatives")
df_pass.head()

Unnamed: 0,verification details,xgyrseezxpfr,passport,idp,british passport,hr passport,dueufvgzbbix,passport_identification_no,cli pspt num,psptno.1,...,unnueytrwbkb,visa number,wpuawkihjvgg,mk pspt num,guosouvxdqwr,xpsosvqziict,customer passport,uk passport number,identification_number_passport,psptid
0,[Passport],[Passport],[Passport],[Passport],[Passport],[Passport],[Passport],[Passport],[Passport],[Passport],...,[Passport],[Passport],[Passport],[Passport],[Passport],[Passport],[Passport],[Passport],[Passport],[Passport]
1,"[GPE, Nationality, Geolocation, Address, Date,...","[GPE, Geolocation, Address, Person, ID_Card]","[GPE, Nationality, Geolocation, Address, Perso...","[Address, GPE, ID_Card, Geolocation]","[GPE, Nationality, Geolocation, Other_data, Ad...","[Address, GPE, ID_Card, Geolocation]","[GPE, Geolocation, Address, Person, ID_Card]","[GPE, Geolocation, Address, Date, ID_Card]","[ID_Card, Person]","[Address, GPE, ID_Card, Geolocation]",...,"[GPE, Geolocation, Address, Person, ID_Card]","[GPE, Nationality, Geolocation, Address, Perso...","[GPE, Nationality, Geolocation, Address, Perso...","[GPE, Nationality, Geolocation, Address, Perso...","[GPE, Geolocation, Address, Date, ID_Card]","[GPE, Geolocation, Address, Person, ID_Card]","[GPE, Geolocation, Address, Person, Date, ID_C...","[GPE, Geolocation, Other_data, Address, Phone_...","[GPE, Nationality, Geolocation, Address, Perso...","[GPE, Geolocation, Address, Person, ID_Card]"
2,W1682102,O5018656,D3319512,D4762856,1183831,U3609154,W2582465,T9665447,O0829812,W2072094,...,W6093143,MD444262,W3924781,KS260799,W5630895,N090850,U7717741,60243679,W8766944,U6108611
3,A1317864,K418723,S0950263,E4277924,9328368,W6922708,O9491883,E5160766,O1711133,A7806146,...,U5507247,MN145334,X8632888,NT847970,O2237222,W6453894,A9314129,51140705,T2632733,K5779386
4,O1382027,O4600267,H2570127,A9154282,97072359,W0476146,N015883,E8877814,W7511940,F684196,...,W6177201,AK197984,G4652166,EF683072,U4024894,W5198503,A4783794,72453327,H2038703,T2528310


Presidio can only detect US Passports

In [71]:
# Person columns for which Presidio did not predict "Person".
df_per = analyse_class("Person", focus="false negatives")
df_per.head()

Unnamed: 0,zhiiickzngbe
0,"[Person, NIN]"
1,"[Organization, Other_data, Phone_number, Date,..."
2,37519064825
3,86705213492
4,3452918671


In [72]:
# Show every value of the single missed Person column to judge its label by hand.
df_per.values

array([[list(['Person', 'NIN'])],
       [list(['Organization', 'Other_data', 'Phone_number', 'Date', 'ID_Card', 'NIN'])],
       [37519064825],
       [86705213492],
       [3452918671],
       [26904385719],
       [16795830420],
       [48021379563],
       [24896350710],
       [43806792516],
       [80497251366],
       [68079415231],
       [51293860478],
       [91374860526],
       [26781390545],
       [29318570649],
       [30762154980],
       [15492736087],
       [67052843914],
       [53978041621],
       [69102843757],
       [58632407198],
       [80961374520],
       [32496185074],
       [38109245764],
       [6372418951],
       [34725981602],
       [24631975803],
       [84971035625],
       [90534678124],
       [97834216506],
       [97453026814],
       [63720948513],
       [7498152367],
       [56037189240],
       [45632798102],
       [32678590147],
       [49037285168],
       [91826504733],
       [7192543683],
       [63815490723],
       [38672401597],
 

In [73]:
# Find the position in dessi_all of the column "zhiiickzngbe" shown in df_per.
# The name is compared with equality (not substring containment), and the
# split name in the last row of both frames must match as well.
indis = []
last_row = df_per.index[-1]  # row holding the split name of every column
for e, (i, t) in enumerate(zip(dessi_all.columns, dessi_all.iloc[-1, :])):
    if i != "zhiiickzngbe":
        continue
    split_entry = df_per.loc[last_row, i]
    # A duplicated column name yields a Series of split names, a unique one a single value.
    split_values = list(split_entry) if isinstance(split_entry, pd.Series) else [split_entry]
    if t in split_values:
        indis.append(e)
for ind in indis:
    # Do not record a column twice when the cell is run again.
    if ind not in wrong_columns:
        wrong_columns.append(ind)
    dessi_labels_cleaned.loc[ind, "label"] = "NIN"
indis

[24764]

In [74]:
# Phone_number columns for which Presidio did not predict "Phone_number".
df_pho = analyse_class("Phone_number", focus="false negatives")
df_pho.head()

Unnamed: 0,rdmbvarvwvjz,emailpromotion.6,telephone.5,phone_no.4,phone_number.15,zhignxkecbng,cell_phone_number.16,zfjoaqmkqfem,cellphone_number.23,sms.32,...,phone_ext.5,mobilephonenumber.15,family_contact.13,telephone.34,family_contact_email.24,dohocumdqdlz,kbakskboyent,hzxgmsuvjjav,gwfoopuknpio,ukhxzgzxwdgx
0,[Phone_number],"[Phone_number, Email]",[Phone_number],[Phone_number],[Phone_number],[Phone_number],[Phone_number],[Phone_number],[Phone_number],[Phone_number],...,[Phone_number],[Phone_number],[Phone_number],[Phone_number],"[Phone_number, NIN]",[Phone_number],[Phone_number],[Phone_number],[Phone_number],[Phone_number]
1,"[Date, Other_data, ID_Card, NIN]","[GPE, Nationality, Geolocation, Other_data, Ad...","[Organization, Other_data, Passport, Date, ID_...","[Address, GPE, Date, Geolocation]","[Organization, Other_data, Passport, Date, ID_...","[Date, Other_data, ID_Card, NIN]","[Date, Other_data, ID_Card, NIN]","[Date, Other_data, ID_Card, NIN]","[Date, Other_data, ID_Card, NIN]","[Date, Other_data, ID_Card, NIN]",...,"[GPE, Organization, Geolocation, Other_data, A...","[Date, Other_data, ID_Card, NIN]","[Date, Other_data, ID_Card, NIN]","[GPE, Nationality, Geolocation, Address, Perso...","[GPE, Nationality, Geolocation, Address, Perso...","[Date, Other_data, ID_Card, NIN]","[Organization, Other_data, Passport, Date, ID_...","[Organization, Other_data, Passport, Date, ID_...","[Date, Other_data, ID_Card, NIN]","[Date, Other_data, ID_Card, NIN]"
2,445314460907,cmiller2044@aguilar.ly,243048296,4O0229532323,557522686,233281895900,447700900543,447700900081,233555144749,233541752803,...,500351531,233551456861,233279471352,+234 1 012 6009,SVMYQX46D36U367M,233260449189,546075770,270734431,440444760765,233282697405
3,440544560923,bharman@martinez.online,243841266,1O8044576295,557396579,233283671760,447700900636,447700900829,233557989303,233547322019,...,503336796,233557909482,233274084351,+234 1 662 3967,IZUDDQ97L62I775S,233266458589,541816952,276131641,441054660784,233284078101
4,444534460923,dvladusic@gmail.com,241699147,8O2077646592,553231611,233286135584,447700900546,447700900967,233558106237,233547151949,...,509905084,233552491728,233277175792,+234 1 525 7813,FCZRWN55W26S618S,233266021864,549924762,279367956,448624860123,233285618272


In [75]:
# Print the full contents of a handful of Phone_number columns so that their
# values can be inspected by eye.
columns_to_inspect = (
    "emailpromotion.6",
    "phone_no.4",
    "jbubejnymgfj",
    "email.96",
    "immrqliobmwo",
    "family_contact_email.24",
)
for column_name in columns_to_inspect:
    column_values = df_pho[column_name].values
    print(column_values)

[list(['Phone_number', 'Email'])
 list(['GPE', 'Nationality', 'Geolocation', 'Other_data', 'Address', 'Person', 'Email', 'Date', 'Religion', 'ID_Card', 'NIN'])
 'cmiller2044@aguilar.ly' 'bharman@martinez.online' 'dvladusic@gmail.com'
 'kalipalmer874@krause-howe.org' 'pia.stoiljkovic87@yahoo.hr'
 'joshua-whelan@dunn.com' 'everetteharkins@green.info' 'a-j456@yahoo.com'
 'h.seaman877@hotmail.com' 'h.higgins301@hall-hughes.net'
 'ivicabosiljevac@hotmail.com' 'm.w@preston-anderson.org' 'ac@yahoo.com'
 'ah@jensen-green.us' 'nf@gmail.com' 'e.b@gmail.com'
 'n-a9@mcknight-butler.online' 'r.r4635@green.biz'
 'lex.lemons@hotmail.com' 'm-rash@curry-williams.site'
 'j.roberts@hotmail.com' 'p-jones22@wood.xyz' 'mhowell5457@dawson.xyz'
 'zjordan735@patton-roberts.biz' 'm-ranic3@inet.hr' 'iva-dimic@zagreb.hr'
 'carolinemartinez@clay-thomas.info' 'v.p@gmail.com'
 'kimberlyray743@gmail.com' 'charlottehoffman@gmail.com'
 'jayla-martinez@brewer-harris.us' 'mw@hotmail.com'
 'z-licul9067@gmail.com' 'lstewar

In [76]:
# Collect the positional indices of the dessi_all columns that should be flagged.
# A column is selected when its name is listed in `cols` AND the value in the last
# row of dessi_all is contained in the row-102 entry of the same-named df_pho column.
# NOTE(review): row 102 of df_pho presumably carries the dataset split - confirm.
indis = []
cols = ["emailpromotion.6", "jbubejnymgfj", "family_contact_email.24"]
for position, (column_name, last_row_value) in enumerate(
    zip(dessi_all.columns, dessi_all.iloc[-1, :])
):
    # Skip early so df_pho is only queried for the columns of interest.
    if column_name not in cols:
        continue
    reference = df_pho.loc[102, column_name]
    # .loc may return a pandas object (e.g. a Series when the column label is not
    # unique) or a plain scalar. Compare against `.values` when it exists, otherwise
    # against the scalar itself. This replaces a bare `except:` that swallowed every
    # error rather than only the missing `.values` attribute.
    candidates = getattr(reference, "values", reference)
    if last_row_value in candidates:
        indis.append(position)
# Record the flagged columns in the notebook-wide list of wrong columns.
wrong_columns.extend(indis)
len(indis)

3

In [77]:
# Display the current "label" entries of the rows whose indices were collected in `indis`.
dessi_labels_cleaned.loc[indis,"label"]

1040     Phone_number,Email
7836     Phone_number,Email
24064      Phone_number,NIN
Name: label, dtype: object

In [78]:
# Overwrite the labels of the three flagged entries, addressed by their position
# in `indis`: the first two become "Email", the third becomes "NIN".
corrected_labels = ("Email", "Email", "NIN")
for position, new_label in enumerate(corrected_labels):
    dessi_labels_cleaned.loc[indis[position], "label"] = new_label

In [79]:
# Run analyse_class for "Phone_number" again, this time with focus="true positives",
# and preview the first rows.
# Note: this rebinds df_pho, so the frame from the earlier call is no longer available.
df_pho = analyse_class("Phone_number", focus="true positives")
df_pho.head()

Unnamed: 0,fxvspjlbgrhm,phone_number,phone_no,family_email,rrubbcpjrqht,phone_ext,ijfsukxdjfhd,bndecuspucsy,homephone,cell_phone_number,...,cell_phone_number.19,ofyzhctfwwoa,zsjibvlwpvuq,bwghofaruydq,mobilephonenumber.36,usmvdyjshjrb,cell_phone_number.20,phone_ext.23,itittmxlzqjr,customer/contact
0,[Phone_number],[Phone_number],[Phone_number],[Phone_number],[Phone_number],[Phone_number],[Phone_number],[Phone_number],"[Email, Phone_number]",[Phone_number],...,[Phone_number],[Phone_number],[Phone_number],[Phone_number],[Phone_number],[Phone_number],[Phone_number],[Phone_number],[Phone_number],"[Person, Phone_number]"
1,"[GPE, Geolocation, Organization, Other_data, A...","[GPE, Geolocation, Organization, Other_data, A...","[GPE, Geolocation, Other_data, Address, Person...","[GPE, Geolocation, Organization, Other_data, A...","[GPE, Geolocation, Organization, Other_data, A...","[GPE, Geolocation, Organization, Other_data, A...","[GPE, Geolocation, Organization, Other_data, A...","[GPE, Geolocation, Other_data, Address, Phone_...","[GPE, Geolocation, Other_data, Address, Person...","[GPE, Geolocation, Address, Phone_number, Date]",...,"[GPE, Geolocation, Organization, Other_data, A...","[GPE, Geolocation, Organization, Other_data, A...","[GPE, Geolocation, Other_data, Address, Person...","[Phone_number, Date]","[GPE, Geolocation, Organization, Other_data, A...","[GPE, Geolocation, Other_data, Address, Passpo...","[GPE, Geolocation, Other_data, Address, Passpo...","[GPE, Geolocation, Other_data, Address, Phone_...","[GPE, Geolocation, Other_data, Address, Passpo...","[GPE, Geolocation, Organization, Other_data, A..."
2,+55 (031) 4280-4254,(071) 1428-1198,31 0020-5087,152691757/5,0209981558,(470)4960902,(051) 2074-9163,13840749598,19081 0 24 14 18 94,+91 6978 5766 35,...,5780/ 22 04 07,021 841 629,+163 6 440 5217,(2804) 038 52 73 -,+92 (165) 75254-48,18880977774,097 677-48-50,31 4374 3014,334-314-6849x206,Mae/ +44(0)1514960632
3,+23 (0511) 2695,2576223255,+55 93 99082 4085,+45 (65) 90179-6903,721 0879103,2316208510,+97 (156) 935900,+51 61 8358 3615,danica-dereta3@xnet.hr,+06 3801 3028 32,...,0288658117,+233244463951,416.737.9285x710,(5575) 291 19 64 -,5329626342,(342)392-8081,6120496814,06415160231,"9652281017, 5500582145",Kenlee/ +44(0)113 496 0895
4,(251) 724-6950 x782,1415554659,+233554323182,044 209 709,+447700900238,1602520745,5622274934,+55 90 96141 4013,s.d30@yahoo.com,+64 1530 1149 85,...,935-837-2993x992,03406 670 0584,31 1792-0744,(2992) 707 68 30 -,+55 (071) 3375-7834,0279340988,+233567504076,+233579234929,1350292495,Roisin Harlow Archer/ 01 883 1767


Presidio may need a + or spaces in the number to detect phone numbers.

It is very difficult to compare Presidio with CASSED, as CASSED has classes like Sexuality that Presidio cannot detect.

In [80]:
# Persist the results of this notebook as CSV files (without the index column):
# the collected wrong-column indices and the cleaned label table.
wrong_columns_frame = pd.DataFrame(wrong_columns)
wrong_columns_frame.to_csv("dessi_cleaned/wrong_columns.csv", index=False)
dessi_labels_cleaned.to_csv("dessi_cleaned/all_labels_cleaned.csv", index=False)

Continuation in 'find_label_errors_3.ipynb'