In [1]:
import pandas as pd
import re
import numpy as np
import ast
import json
import os

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_auc_score, confusion_matrix
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Evaluation of Presidio results on own datasets

In [2]:
# Root folder of all evaluation datasets (relative to the notebook's working directory).
DATASETS_DIR = "../../datasets"

# test_languages: labels, data, plus per-column class and source-dataset files
lan_data_labels = pd.read_csv(f"{DATASETS_DIR}/test_languages/test_labels_pii.csv")
lan_data = pd.read_csv(f"{DATASETS_DIR}/test_languages/test.csv")
lan_data_classes = pd.read_csv(f"{DATASETS_DIR}/test_languages/test_classes.csv")
lan_data_dataset = pd.read_csv(f"{DATASETS_DIR}/test_languages/test_dataset.csv")

# kaggle: labels and data
kaggle_data_labels = pd.read_csv(f"{DATASETS_DIR}/kaggle_datasets/all_datasets_labels_pii.csv")
kaggle_data = pd.read_csv(f"{DATASETS_DIR}/kaggle_datasets/all_datasets.csv")

# openml: labels and data
openml_data_labels = pd.read_csv(f"{DATASETS_DIR}/openml_datasets/all_datasets_labels_pii.csv")
openml_data = pd.read_csv(f"{DATASETS_DIR}/openml_datasets/all_datasets.csv")

# openml_2: labels and data
openml_2_data_labels = pd.read_csv(f"{DATASETS_DIR}/openml_datasets_2/all_datasets_labels_pii.csv")
openml_2_data = pd.read_csv(f"{DATASETS_DIR}/openml_datasets_2/all_datasets.csv")

In [3]:
def read_files_binary(method, dataset):
    """Load the stored binary prediction results of one Presidio approach.

    Args:
        method: Approach name used in the file name (``results_<method>.csv``).
        dataset: Sub-folder of ``../predictions/binary_results`` to read from.

    Returns:
        The CSV content as a pandas DataFrame.
    """
    csv_path = f"../predictions/binary_results/{dataset}/results_{method}.csv"
    return pd.read_csv(csv_path)

def _load_labelled_results(method, dataset, labels):
    """Read one results file and overwrite its first row with the ground-truth labels."""
    results = read_files_binary(method, dataset)
    # Row 0 holds the true label of every column; the evaluation functions read it from there.
    results.iloc[0, :] = labels["label"].values
    return results


results_rowwise1_lan = _load_labelled_results("rowwise1", "test_languages_res", lan_data_labels)
results_rowwise2_lan = _load_labelled_results("rowwise2", "test_languages_res", lan_data_labels)
results_columnwise_lan = _load_labelled_results("columnwise", "test_languages_res", lan_data_labels)

results_rowwise1_kaggle = _load_labelled_results("rowwise1", "kaggle", kaggle_data_labels)
results_rowwise2_kaggle = _load_labelled_results("rowwise2", "kaggle", kaggle_data_labels)
results_columnwise_kaggle = _load_labelled_results("columnwise", "kaggle", kaggle_data_labels)

results_rowwise1_openml = _load_labelled_results("rowwise1", "openml", openml_data_labels)
results_rowwise2_openml = _load_labelled_results("rowwise2", "openml", openml_data_labels)
results_columnwise_openml = _load_labelled_results("columnwise", "openml", openml_data_labels)

results_rowwise1_openml_2 = _load_labelled_results("rowwise1", "openml_2", openml_2_data_labels)
results_rowwise2_openml_2 = _load_labelled_results("rowwise2", "openml_2", openml_2_data_labels)
results_columnwise_openml_2 = _load_labelled_results("columnwise", "openml_2", openml_2_data_labels)

The entity classes reported by Presidio must be mapped to either pii or non-pii.

In [4]:
# Collect every distinct entity label that occurs in any of the result frames,
# so that the labels can afterwards be mapped to pii / non-pii.
s = set()
for df in [results_rowwise1_kaggle, results_rowwise2_kaggle, results_columnwise_kaggle,
           results_rowwise1_lan, results_rowwise2_lan, results_columnwise_lan,
           results_rowwise1_openml, results_rowwise2_openml, results_columnwise_openml,
           results_rowwise1_openml_2, results_rowwise2_openml_2, results_columnwise_openml_2]:
    n_rows = df.shape[0]
    for i in range(df.shape[1]):
        # Every second row starting at row 2 is parsed as a list literal. The
        # upper bound follows the frame's own height (the same slice
        # build_dataframe_binary uses) instead of a hard-coded 201, so shorter
        # frames do not raise IndexError and longer frames are not truncated.
        for cell in df.iloc[2:n_rows:2, i]:
            for v in ast.literal_eval(cell):
                # Keep only the text before the first "." of each label.
                s.add(v.split(".")[0])
s

{'AU_ABN_1',
 'AU_ACN_1',
 'AU_MEDICARE_1',
 'AU_TFN_1',
 'CREDIT_CARD_1',
 'DATE_TIME_0',
 'EMAIL_ADDRESS_1',
 'IBAN_CODE_1',
 'IN_AADHAAR_1',
 'IN_PAN_0',
 'IN_VEHICLE_REGISTRATION_0',
 'IP_ADDRESS_0',
 'LOCATION_0',
 'MEDICAL_LICENSE_1',
 'NRP_0',
 'PERSON_0',
 'PHONE_NUMBER_0',
 'UK_NHS_1',
 'URL_0',
 'US_BANK_NUMBER_0',
 'US_DRIVER_LICENSE_0',
 'US_PASSPORT_0',
 'US_SSN_0',
 'shared DATE_TIME_0',
 'shared IN_PAN_0',
 'shared LOCATION_0',
 'shared PERSON_0',
 'shared URL_0',
 'shared US_DRIVER_LICENSE_0'}

In [4]:
# Entity classes that are treated as NOT containing pii information.
# Every class that is not listed here is considered pii-related.
NON_pii = [
    "AU_ABN",
    "AU_ACN",
    "DATE_TIME",
    "LOCATION",
    "NRP",
    "URL",
    "shared DATE_TIME",
    "shared LOCATION",
    "shared URL",
]

It is very unlikely that the Australian and Indian IDs recognized by Presidio are correct, so Presidio might actually perform worse than these results suggest.

In [5]:
def get_categories_binary(strings):
    """Map a collection of entity class names to ``"pii"`` or ``"non-pii"``.

    Args:
        strings: Iterable of entity class names.

    Returns:
        ``"pii"`` if at least one name is not listed in ``NON_pii``;
        ``"non-pii"`` if the collection is empty or only holds ``NON_pii`` names.
    """
    has_pii_class = any(name not in NON_pii for name in strings)
    return "pii" if has_pii_class else "non-pii"

def build_dataframe_binary(results_df, threshold_score, threshold_count):
    """Append the thresholded detections and the final pii/non-pii prediction to a copy of ``results_df``.

    Three rows are appended below the existing ``height`` rows of the copy:

    * row ``height``: labels whose number of occurrences in the column is
      strictly greater than ``threshold_count``,
    * row ``height+1``: the occurrence counts of those labels,
    * row ``height+2``: ``"pii"`` / ``"non-pii"`` as returned by
      ``get_categories_binary`` for the labels that also pass ``threshold_score``.

    Args:
        results_df: Frame whose rows 2, 4, 6, ... (up to ``height``) hold string
            representations of lists of labels, one column per evaluated column.
        threshold_score: Minimum number (>=) that the first ``<digits>.<digits>``
            match inside a label must reach for the label to be kept.
        threshold_count: A label must occur more often than this (>) to be kept.

    Returns:
        The extended copy; ``results_df`` itself is not modified.

    Note:
        Prints both thresholds on every call.
        Every kept label must contain a decimal number such as ``0.85``:
        ``re.search`` returns ``None`` otherwise and ``.group(1)`` raises
        ``AttributeError``.
    """
    height, width = results_df.shape
    results_copy = results_df.copy()
    print(f"Threshold score: {threshold_score}\nThreshold count: {threshold_count}")
    # Placeholder rows: two rows of fresh empty lists (filled in place below) and one row of empty strings.
    empty_row1 = [[] for _ in range(width)]
    empty_row2 = [[] for _ in range(width)]
    empty_row3 = ["" for _ in range(width)]
    results_copy.loc[height] = empty_row1.copy()
    results_copy.loc[height+1] = empty_row2.copy()
    results_copy.loc[height+2] = empty_row3.copy()
    for i in range(width):
        # Flatten the parsed lists of every second row (2 .. height-1) and count each label.
        series = pd.Series([item for sublist in results_copy.iloc[2:height:2, i]
                        for item in ast.literal_eval(sublist)], dtype="object").value_counts()
        for a, b in series.items():
            # a = label, b = its count; the comparison is strict.
            if b > threshold_count:
                # The cells hold list objects, so append mutates them in place.
                results_copy.iloc[height, i].append(a)
                results_copy.iloc[height+1, i].append(b)
    for i in range(width):
        # Keep labels whose embedded decimal number reaches threshold_score and
        # strip the trailing "_<number>" suffix before mapping to pii / non-pii.
        entities = [re.sub(r'_\d+(\.\d+)?$', '', item)
                    for item in results_copy.iloc[height, i] if float(re.search(r'(\d+\.\d+)', item).group(1)) >= threshold_score]
        mapped_entities = get_categories_binary(entities)
        results_copy.iloc[height+2, i] = mapped_entities
    return results_copy

def compute_performance_binary(results):
    """Compute binary classification metrics from a results frame.

    The true labels are read from the row labelled ``0`` and the predictions
    from the row labelled ``results.shape[0] - 1``. ``"pii"`` is encoded as 1
    and every other value as 0 so that the metrics are meaningful.

    Args:
        results: Frame as returned by ``build_dataframe_binary``.

    Returns:
        Dict with the keys ``f1``, ``precision``, ``recall``, ``accuracy``
        and ``roc_auc``.
    """
    def as_binary(label):
        return 1 if label == "pii" else 0

    frame = results.copy()
    prediction_row = frame.shape[0] - 1
    y_true = frame.loc[0].map(as_binary)
    y_pred = frame.loc[prediction_row].map(as_binary)

    precision = precision_score(y_true, y_pred, zero_division=0.0)
    recall = recall_score(y_true, y_pred, zero_division=0.0)
    f1 = f1_score(y_true, y_pred, zero_division=0.0)
    accuracy = accuracy_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred)

    return {
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'accuracy': accuracy,
        'roc_auc': roc_auc,
    }

Apply thresholds to the Presidio score and to the number of detections.  
Find out which thresholds and which method yield the best results.

In [None]:
# Grid search: evaluate every (score threshold, count threshold) pair for each
# approach and dataset, and store the metrics of one approach/dataset per JSON file.
order = ["lan", "kaggle", "openml", "openml_2"]
file_name = ["two_languages", "kaggle", "openml", "openml_2"]

threshold_scores = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
threshold_counts = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]

# (banner printed once per dataset, JSON file stem, frames in the same order as `order`)
threshold_runs = [
    ("Method 1 Rowwise", "rowwise1",
     [results_rowwise1_lan, results_rowwise1_kaggle, results_rowwise1_openml, results_rowwise1_openml_2]),
    ("Method 2 Rowwise", "rowwise2",
     [results_rowwise2_lan, results_rowwise2_kaggle, results_rowwise2_openml, results_rowwise2_openml_2]),
    ("Method Columnwise", "columnwise",
     [results_columnwise_lan, results_columnwise_kaggle, results_columnwise_openml, results_columnwise_openml_2]),
]

for banner, method, frames in threshold_runs:
    for e, df in enumerate(frames):
        evals = dict()
        print(banner)
        for threshold_score in threshold_scores:
            for threshold_count in threshold_counts:
                # Named `metrics` rather than `eval` so the builtin is not shadowed.
                metrics = compute_performance_binary(build_dataframe_binary(df, threshold_score, threshold_count))
                # Key layout "<dataset>_<score>_<count>" is parsed again by create_df_from_json.
                evals[f"{order[e]}_{threshold_score}_{threshold_count}"] = metrics
        out_dir = f"../predictions/binary_thresholds/pii/{file_name[e]}"
        # Create the target folder on a fresh checkout instead of failing in open().
        os.makedirs(out_dir, exist_ok=True)
        with open(f"{out_dir}/{method}.json", "w") as json_file:
            json.dump(evals, json_file, indent=4)

In [8]:
def create_df_from_json(i, dataset):
    """Load one threshold-search JSON file into a flat DataFrame.

    Reads ``../predictions/binary_thresholds/pii/<dataset>/<i>.json``. Every
    key has the layout ``"<dataset prefix>_<threshold_score>_<threshold_count>"``
    and maps to a dict of metrics.

    The key is split from the right, so a prefix that itself contains ``"_"``
    (e.g. ``"openml_2"``) is kept intact and no per-dataset special case is
    needed. The previous left-to-right split reported ``"openml"`` in the
    ``dataset`` column for ``openml_2`` files.

    Args:
        i: File stem, i.e. the approach name (``"rowwise1"``, ``"rowwise2"``, ``"columnwise"``).
        dataset: Folder name below ``../predictions/binary_thresholds/pii``.

    Returns:
        DataFrame with the columns ``dataset``, ``threshold_score``,
        ``threshold_count``, ``f1``, ``precision``, ``recall``, ``accuracy``
        and ``roc_auc`` (in this order). Both threshold columns hold the
        *string* parts of the key, not numbers.

    Raises:
        KeyError: If an entry lacks one of the metric keys.
        ValueError: If a key does not contain at least two ``"_"`` separators.
    """
    metric_names = ["f1", "precision", "recall", "accuracy", "roc_auc"]
    column_order = ["dataset", "threshold_score", "threshold_count"] + metric_names

    with open(f"../predictions/binary_thresholds/pii/{dataset}/{i}.json", "r") as file:
        data = json.load(file)

    records = []
    for key, val in data.items():
        prefix, threshold_score, threshold_count = key.rsplit("_", 2)
        record = {
            "dataset": prefix,
            "threshold_score": threshold_score,
            "threshold_count": threshold_count,
        }
        for metric in metric_names:
            record[metric] = val[metric]
        records.append(record)

    # Passing `columns` keeps the column layout stable even for an empty file.
    return pd.DataFrame(records, columns=column_order)

In [9]:
# One frame per approach and dataset, loaded from the stored threshold-search JSON files.
approach_names = ("rowwise1", "rowwise2", "columnwise")

df_rowwise1_lan, df_rowwise2_lan, df_columnwise_lan = (
    create_df_from_json(approach, "two_languages") for approach in approach_names
)

df_rowwise1_kaggle, df_rowwise2_kaggle, df_columnwise_kaggle = (
    create_df_from_json(approach, "kaggle") for approach in approach_names
)

df_rowwise1_openml, df_rowwise2_openml, df_columnwise_openml = (
    create_df_from_json(approach, "openml") for approach in approach_names
)

df_rowwise1_openml_2, df_rowwise2_openml_2, df_columnwise_openml_2 = (
    create_df_from_json(approach, "openml_2") for approach in approach_names
)

In [10]:
# Stack the three approaches of every dataset: rows of rowwise1 first, then rowwise2, then columnwise.
df_lan = pd.concat([df_rowwise1_lan, df_rowwise2_lan, df_columnwise_lan]).reset_index(drop=True)
df_kaggle = pd.concat([df_rowwise1_kaggle, df_rowwise2_kaggle, df_columnwise_kaggle]).reset_index(drop=True)
df_openml = pd.concat([df_rowwise1_openml, df_rowwise2_openml, df_columnwise_openml]).reset_index(drop=True)
df_openml_2 = pd.concat([df_rowwise1_openml_2, df_rowwise2_openml_2, df_columnwise_openml_2]).reset_index(drop=True)

metric_names = ["f1", "precision", "recall", "accuracy", "roc_auc"]
n_approaches = 3

for (df, dataset) in zip([df_lan, df_kaggle, df_openml, df_openml_2], ["two_languages", "kaggle", "openml", "openml_2"]):
    # One colour range for all panels so that metrics and approaches are directly comparable.
    zmin = min(df[metric].min() for metric in metric_names)
    zmax = max(df[metric].max() for metric in metric_names)

    fig = make_subplots(
        rows=5, cols=3, row_titles=("F1-Score", "Precision", "Recall", "Accuracy", "ROC_AUC"), column_titles=("Rowwise 1", "Rowwise 2", "Columnwise"),
        shared_xaxes=True, shared_yaxes=True, vertical_spacing=0.05, horizontal_spacing=0.025, x_title="Threshold score", y_title="Threshold count",
        specs=[[{"type": "contour"} for _ in range(n_approaches)] for _ in metric_names]
    )

    # Each approach contributes the same number of rows (the full threshold grid),
    # so the block size is derived from the frame instead of being hard-coded.
    rows_per_approach = len(df) // n_approaches

    for e in range(n_approaches):
        start_idx = e * rows_per_approach
        df_subset = df.iloc[start_idx:start_idx + rows_per_approach]
        threshold_scores = sorted(df_subset['threshold_score'].unique())
        threshold_counts = sorted(df_subset['threshold_count'].unique())

        for ee, metric in enumerate(metric_names):
            Z = np.array(df_subset.pivot(index='threshold_count', columns='threshold_score', values=metric))

            # All traces share zmin/zmax, so a single colorbar is enough. Its title must
            # not name one metric ("F1 score") because it applies to every row.
            fig.add_trace(go.Contour(z=Z, x=threshold_scores, y=threshold_counts, colorscale="Viridis", zmin=zmin, zmax=zmax,
                colorbar=dict(title="Metric value"), showscale=(e == 0 and ee == 0),
                contours=dict(showlabels=True)), row=ee+1, col=e+1)

            # Black crosses mark every evaluated grid point.
            hover_text = [f"x: {x}, y: {y}, z: {z}" for x, y, z in zip(df_subset['threshold_score'],df_subset['threshold_count'],df_subset[metric])]
            fig.add_trace(go.Scatter(
                x=df_subset['threshold_score'], y=df_subset['threshold_count'], mode='markers', marker=dict(color='black', symbol='x'),
                text=hover_text, hoverinfo="text", showlegend=False), row=ee+1, col=e+1)

            # Red cross marks the best grid point of this metric for this approach
            # (idxmax returns the first maximum; the index label is valid in df
            # because the index was reset after concatenation).
            best_row = df.loc[df_subset[metric].idxmax()]
            best_threshold_score = best_row['threshold_score']
            best_threshold_count = best_row['threshold_count']
            best_value = best_row[metric]

            hover_text = [f"x: {best_threshold_score}, y: {best_threshold_count}, z: {best_value}"]
            fig.add_trace(go.Scatter(x=[best_threshold_score], y=[best_threshold_count], marker=dict(color='red', symbol='x'), showlegend=True if e == 0 and ee == 0 else False,
                            mode="markers", text=hover_text, hoverinfo="text", name="best_score", legendgroup="best"), row=ee+1, col=e+1)

    fig.update_layout(
        title=f"Contour Plot of different performance metrics on {dataset} dataset separated by Presidio analyzer approach",
        legend=dict(x=1,y=1.1),
        height=700,
        width=1200
    )

    fig.show()


 --> The best threshold depends on which metric is most important.  
 --> All thresholds are noted in best_threshold.txt.

# Presidio evaluation on binary data: analyzer approaches vs. structured Presidio

In [11]:
def predict_analysis_results(data_test, tabular_analysis):
    """Turn a per-column analysis mapping into pii / non-pii predictions.

    Args:
        data_test: DataFrame whose columns are classified, in column order.
        tabular_analysis: Mapping from column name to the class assigned to it.

    Returns:
        List with one entry per column of ``data_test``: ``"pii"`` when the
        column has an assigned class that is not listed in ``NON_pii``,
        otherwise ``"non-pii"`` (this includes columns missing from the mapping).
    """
    return [
        "pii" if column in tabular_analysis and tabular_analysis[column] not in NON_pii else "non-pii"
        for column in data_test.columns
    ]

def compute_performance_structured(data_test, data_test_labels, tabular_analysis):
    """Score the predictions derived from ``tabular_analysis`` against the true labels.

    Args:
        data_test: DataFrame whose columns were analysed.
        data_test_labels: DataFrame with a ``"label"`` column (``"pii"`` / other).
        tabular_analysis: Mapping from column name to assigned class.

    Returns:
        ``[precision, recall, f1, accuracy, roc_auc]`` with ``"pii"`` as the
        positive class.
    """
    def as_binary(label):
        return 1 if label == "pii" else 0

    predicted_labels = predict_analysis_results(data_test, tabular_analysis)
    y_pred = list(map(as_binary, predicted_labels))
    y_true = data_test_labels["label"].map(as_binary)

    scores = [
        precision_score(y_true, y_pred, zero_division=0.0),
        recall_score(y_true, y_pred, zero_division=0.0),
        f1_score(y_true, y_pred, zero_division=0.0),
        accuracy_score(y_true, y_pred),
        roc_auc_score(y_true, y_pred),
    ]
    return scores

In [12]:
def visualize_results(approaches_results, data, labels, title, score, count):
    """Bar chart comparing the analyzer approaches with the structured strategies.

    Args:
        approaches_results: Threshold-search frames of the approaches, in the
            order rowwise1, rowwise2, columnwise (as built by ``create_df_from_json``).
        data: DataFrame whose columns were classified (for the structured strategies).
        labels: DataFrame with the true ``"label"`` per column.
        title: Folder below ``../predictions/binary_results`` that holds
            ``most_common.json`` / ``highest_confidence.json``; also used in the figure title.
        score: Threshold score to select. Compared with ``==`` against the
            ``threshold_score`` column, which ``create_df_from_json`` stores as
            strings, so pass a string such as ``"0.1"``.
        count: Threshold count to select, same string convention (e.g. ``"20"``).

    Raises:
        ValueError: If an approach frame has no row for the requested thresholds
            (previously an opaque ``IndexError`` from ``.values[0]``).
    """
    # Metrics are selected by name and in the order returned by
    # compute_performance_structured, instead of by positional column index.
    metric_columns = ["precision", "recall", "f1", "accuracy", "roc_auc"]
    colors = ["#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#FFA15A", "#19D3F3"]

    fig = go.Figure()
    for ee, (analyzer, analyzer_name) in enumerate(zip(approaches_results, ["rowwise1", "rowwise2", "columnwise"])):
        at_thresholds = (analyzer["threshold_score"] == score) & (analyzer["threshold_count"] == count)
        df_presidio = analyzer.loc[at_thresholds, metric_columns]
        if df_presidio.empty:
            raise ValueError(
                f"No results for thresholds {score!r}/{count!r} in the {analyzer_name} frame"
            )
        fig.add_trace(go.Bar(x=metric_columns, y=df_presidio.values[0], name=f"Presidio {analyzer_name}<br> thresholds {score}/{count}", legendgroup=analyzer_name,
                                showlegend=True, marker=dict(color=colors[ee+1])))

    for eee, strategy in enumerate(["most_common", "highest_confidence"]):
        with open(f"../predictions/binary_results/{title}/{strategy}.json", "rb") as file:
            tabular_analysis = json.load(file)
        performance = compute_performance_structured(data, labels, tabular_analysis)
        # x no longer depends on a variable leaked from the loop above.
        fig.add_trace(go.Bar(x=metric_columns, y=performance, name=f"structured Presidio<br> {strategy}", legendgroup=strategy, showlegend=True,
                        marker=dict(color=colors[4+eee])))

    fig.update_layout(title=f"Performance Metrics Comparison all Presidio approaches on {title} data", barmode='group', width=1200)
    fig.update_xaxes(title_text="Metrics")
    fig.update_yaxes(title_text="Score")
    fig.show()

In [13]:
# One comparison chart per dataset. The thresholds are given as strings because
# the threshold columns of the approach frames hold strings.
comparison_runs = [
    ([df_rowwise1_lan, df_rowwise2_lan, df_columnwise_lan], lan_data, lan_data_labels, "test_languages_res", "0", "60"),
    ([df_rowwise1_kaggle, df_rowwise2_kaggle, df_columnwise_kaggle], kaggle_data, kaggle_data_labels, "kaggle", "0.1", "20"),
    ([df_rowwise1_openml, df_rowwise2_openml, df_columnwise_openml], openml_data, openml_data_labels, "openml", "0.4", "70"),
    ([df_rowwise1_openml_2, df_rowwise2_openml_2, df_columnwise_openml_2], openml_2_data, openml_2_data_labels, "openml_2", "0.1", "50"),
]
for approach_frames, run_data, run_labels, results_folder, run_score, run_count in comparison_runs:
    visualize_results(approach_frames, run_data, run_labels, results_folder, run_score, run_count)

In [14]:
def plot_presidio_results(data_pii, results_pii, dataset):
    """Show the evaluation metrics and the confusion matrix of the final predictions.

    Args:
        data_pii: DataFrame with the true ``"label"`` per column.
        results_pii: Results frame whose last row (label ``shape[0] - 1``)
            holds the predicted ``"pii"`` / ``"non-pii"`` value per column.
        dataset: Dataset name shown in the figure title.
    """
    class_names = ["pii", "non-pii"]
    truth = data_pii["label"].values
    prediction_row = results_pii.shape[0] - 1
    predicted = results_pii.loc[prediction_row].values

    # Raw counts and row-normalised (per true class) confusion matrices.
    cm = confusion_matrix(truth, predicted, labels=class_names)
    cm_norm = confusion_matrix(truth, predicted, labels=class_names, normalize="true")

    # ROC-AUC is computed on 0/1 encodings; the other scores use the string labels.
    truth_bin = [1 if label == "pii" else 0 for label in truth]
    predicted_bin = [1 if label == "pii" else 0 for label in predicted]

    accuracy = accuracy_score(truth, predicted)
    f1 = f1_score(truth, predicted, average="weighted")
    precision = precision_score(truth, predicted, average="weighted")
    recall = recall_score(truth, predicted, average="weighted")
    auc_roc = roc_auc_score(truth_bin, predicted_bin)

    metrics = {
        "weighted Precision": precision,
        "weighted Recall": recall,
        "weighted F1 Score": f1,
        "Accuracy": accuracy,
        "AUC-ROC": auc_roc
    }

    fig = make_subplots(rows=1, cols=2, subplot_titles=["Evaluation Metrics", "Confusion Matrix"])

    metrics_bar = go.Bar(x=list(metrics.keys()), y=list(metrics.values()), showlegend=False)
    fig.add_trace(metrics_bar, row=1, col=1)
    fig.update_yaxes(range=[0,1.1], row=1, col=1)

    # Each heatmap cell shows the absolute count above the normalised share.
    text_values = [
        [f"{cm[r, c]}<br>{cm_norm[r, c]:.2f}" for c in range(2)]
        for r in range(2)
    ]
    fig.add_trace(
        go.Heatmap(
            z=cm,
            x=["Pred: pii", "Pred: non-pii"],
            y=["True: pii", "True: non-pii"],
            colorscale="Blues",
            showscale=False,
            text=text_values,
            texttemplate="%{text}",
            textfont={"size":20},
        ),
        row=1, col=2,
    )

    fig.update_layout(
        title=f"Confusion Matrix and Evaluation Metrics for {dataset} data",
        height=500,
        width=1000,
        showlegend=False
    )

    fig.show()
# Final prediction frames with the chosen thresholds per dataset.
# NOTE(review): every variable is named "columnwise", but the two_languages frame is
# built from the rowwise1 results and the kaggle frame from the rowwise2 results.
# Presumably the best approach per dataset was picked - confirm, and consider renaming.
results_columnwise_lan_final = build_dataframe_binary(results_rowwise1_lan, 0, 60)
results_columnwise_kaggle_final = build_dataframe_binary(results_rowwise2_kaggle, 0.1, 20)
results_columnwise_openml_final = build_dataframe_binary(results_columnwise_openml, 0.4, 70)
results_columnwise_openml_2_final = build_dataframe_binary(results_columnwise_openml_2, 0.1, 50)

final_plot_runs = [
    (lan_data_labels, results_columnwise_lan_final, "two languages"),
    (kaggle_data_labels, results_columnwise_kaggle_final, "kaggle"),
    (openml_data_labels, results_columnwise_openml_final, "openml"),
    (openml_2_data_labels, results_columnwise_openml_2_final, "openml_2"),
]
for final_labels, final_results, final_name in final_plot_runs:
    plot_presidio_results(final_labels, final_results, final_name)

Threshold score: 0
Threshold count: 60
Threshold score: 0.1
Threshold count: 20
Threshold score: 0.4
Threshold count: 70
Threshold score: 0.1
Threshold count: 50


In [15]:
# Per-column overview for the test_languages data: the prediction is the last row
# of the final results frame, joined with the true label, class and source dataset.
lan_prediction_row = results_columnwise_lan_final.shape[0] - 1
lan_predictions = results_columnwise_lan_final.loc[lan_prediction_row].values

results_df_lan = pd.DataFrame(
    {
        "Column": lan_data.columns,
        "Prediction": lan_predictions,
        "True Label": lan_data_labels["label"].values,
        "Classes": lan_data_classes["class"],
        "Dataset": lan_data_dataset["dataset"],
    }
)

In [16]:
# Lower-cased first "_"-separated token of every distinct class name.
CLASS_NAMES = [name.lower().split("_")[0] for name in results_df_lan["Classes"].unique()]


def is_valid_word(word):
    """Return True when the lower-cased first "_"-separated token of ``word`` is in ``CLASS_NAMES``."""
    first_token = word.lower().split("_")[0]
    return first_token in CLASS_NAMES

def create_analysis_plot(results_df, dataset="own_data"):
    """Plot prediction accuracy broken down by class, language, dataset and column-name validity.

    Two figures are shown: (1) accuracy per class together with stacked bars of
    correct / false predictions per class, split by language and by whether the
    column name is "valid" (see ``is_valid_word``); (2) accuracy per language,
    per dataset and per column-name validity.

    Args:
        results_df: Frame that is accessed both by name ("Classes", "Dataset",
            "True Label", "Prediction") and by position; the positional
            accesses assume column 0 = column name, 1 = prediction,
            2 = true label, 3 = class (the layout of ``results_df_lan``).
        dataset: Only switches the language list used for the stacked bars:
            ``"two_languages"`` selects italian/chinese, any other value
            selects english/french/german/mixed language/dessi data.

    Side effects:
        Adds the columns "Classes_new" and "Language" to ``results_df`` in place.
    """
    cla_new, lan = [], []
    for i in range(len(results_df["Classes"])):
        # Strip the language marker from the class name: a 6-character suffix for
        # "mixed" / "de_DE" / "fr_FR" entries, a 3-character suffix for "_xx" entries.
        # NOTE(review): the "_xx" test matches anywhere in the name, while the
        # slice always removes the last 3 characters - confirm no class name
        # contains e.g. "_en" in the middle.
        if "mixed" in results_df.iloc[i, 3] or "de_DE" in results_df.iloc[i, 3] or "fr_FR" in results_df.iloc[i, 3]:
            cla_new.append(results_df.iloc[i, 3][:-6])
        elif "_en" in results_df.iloc[i, 3] or "_de" in results_df.iloc[i, 3] or "_fr" in results_df.iloc[i, 3] or "_it" in results_df.iloc[i,3
                                                                        ] or "_zh" in results_df.iloc[i,3]:
            cla_new.append(results_df.iloc[i, 3][:-3])
        else:
            cla_new.append(results_df.iloc[i, 3])
            
        # Derive the language from the (case-insensitive) last three characters;
        # everything without a known marker is labelled "dessi data".
        if "mixed" in results_df.iloc[i, 3]:
            lan.append("mixed language")
        elif "_fr" == results_df.iloc[i,3][-3:].lower():
            lan.append("french")
        elif "_it" == results_df.iloc[i,3][-3:].lower():
            lan.append("italian")
        elif "_zh" == results_df.iloc[i,3][-3:].lower():
            lan.append("chinese")
        elif "_de" == results_df.iloc[i,3][-3:].lower():
            lan.append("german")
        elif "_en" == results_df.iloc[i,3][-3:].lower():
            lan.append("english")
        else:
            lan.append("dessi data")
    # In-place mutation of the caller's frame.
    results_df["Classes_new"] = cla_new
    results_df["Language"] = lan
    

    # Accuracy per language, per class and per source dataset.
    accuracies_lan = {}
    for i in results_df["Language"].unique():
        lan_df = results_df.loc[results_df["Language"] == i]
        accuracies_lan[i] = accuracy_score(lan_df["True Label"], lan_df["Prediction"])
    accuracies_cla = {}
    for i in results_df["Classes_new"].unique():
        cla_df = results_df.loc[results_df["Classes_new"] == i]
        accuracies_cla[i] = accuracy_score(cla_df["True Label"], cla_df["Prediction"])
    accuracies_dat = {}
    for i in results_df["Dataset"].unique():
        dat_df = results_df.loc[results_df["Dataset"] == i]
        accuracies_dat[i] = accuracy_score(dat_df["True Label"], dat_df["Prediction"])
    # Accuracy for columns with a valid name vs. all remaining columns
    # (positional columns: 2 = true label, 1 = prediction).
    accuracies_col = {}
    ind = []
    for i in range(results_df.shape[0]):
        if is_valid_word(results_df.iloc[i, 0]):
            ind.append(i)
    accuracies_col["valid_col_names"] = accuracy_score(results_df.iloc[ind, 2], results_df.iloc[ind, 1])
    accuracies_col["invalid_col_names"] = accuracy_score(results_df.iloc[[a for a in range(results_df.shape[0]) if a not in ind], 2], 
                                                        results_df.iloc[[a for a in range(results_df.shape[0]) if a not in ind], 1])

    # Sort every breakdown by descending accuracy for plotting.
    accuracies_cla = dict(sorted(accuracies_cla.items(), key=lambda item: item[1], reverse=True))
    accuracies_lan = dict(sorted(accuracies_lan.items(), key=lambda item: item[1], reverse=True))
    accuracies_col = dict(sorted(accuracies_col.items(), key=lambda item: item[1], reverse=True))
    accuracies_dat = dict(sorted(accuracies_dat.items(), key=lambda item: item[1], reverse=True))


    fig = make_subplots(rows=3, cols=1, subplot_titles=["Accuracy per Class", "Number of correct predictions per Class", "Number of false predictions per Class"],
                        shared_xaxes=True)
    # No row/col given for this trace (overall accuracy per class).
    fig.add_trace(go.Bar(
        x=list(accuracies_cla.keys()),
        y=list(accuracies_cla.values()),
        marker=dict(color="black"),
        showlegend=False
    ))
    
    # Colour pairs: index 2*e is used for valid column names, 2*e+1 for invalid ones.
    colors = [
    "#d62728",  # Red (High Saturation)
    "#ff9896",  # Red (Low Saturation)
    "#1f77b4",  # Blue (High Saturation)
    "#aec7e8",  # Blue (Low Saturation)
    "#ff7f0e",  # Orange (High Saturation)
    "#ffbb78",  # Orange (Low Saturation)
    "#2ca02c",  # Green (High Saturation)
    "#98df8a",  # Green (Low Saturation)
    "#9467bd",  # Purple (High Saturation)
    "#c5b0d5",  # Purple (Low Saturation)
    ]

    languages = ['english', 'french', 'german', 'mixed language', 'dessi data'] if dataset != "two_languages" else ["italian", "chinese"]
    # bool_val=True fills row 2 (correct predictions), bool_val=False fills row 3 (false predictions).
    for bool_val in [True, False]:
        for e, language in enumerate(languages):
            lan_df = results_df.loc[results_df["Language"] == language]
            # Reused (and overwritten per class) for both col_valid passes below.
            accuracies_cla = {}
            for ee, col_valid in enumerate([True, False]):
                percentage_of_this_language = []
                for i in lan_df["Classes_new"].unique():
                    cla_df = lan_df.loc[lan_df["Classes_new"] == i]
                    # Keep only the rows whose column-name validity equals col_valid.
                    ind = []
                    for j in range(cla_df.shape[0]):
                        if is_valid_word(cla_df.iloc[j, 0]) == col_valid:
                            ind.append(j)
                    cla_df = cla_df.iloc[ind,:]
                    # Share of this class (over all languages) that falls into this language/validity group.
                    percentage_of_this_language.append(cla_df.shape[0] / results_df.loc[results_df["Classes_new"] == i].shape[0])
                    # Fraction of the group that is correct (bool_val=True) or wrong (bool_val=False); 0 if absent.
                    accuracies_cla[i] = (cla_df["True Label"] == cla_df["Prediction"]).value_counts(normalize=True).get(bool_val, 0)
                # Bar height = within-group fraction weighted by the group's share of the class.
                fig.add_trace(go.Bar(
                    x=list(accuracies_cla.keys()),
                    y=[a * b for a, b in zip(list(accuracies_cla.values()), percentage_of_this_language)],
                    marker=dict(color=colors[2*e+ee]),
                    name=f"{language}_" + ("valid" if col_valid == True else "invalid") + "<br>column name",
                    showlegend=True if bool_val == True else False,
                    legendgroup=2*e+ee
                ), row=2 if bool_val == True else 3, col=1)
    fig.update_layout(title="Accuracy per Class for Presidio's pii Predictions on own dataset", width=1500, height=700, barmode="stack")
    fig.update_yaxes(title_text="Accuracy", row=1, col=1)
    fig.update_yaxes(title_text="Amount of<br>correct predictions", row=2, col=1)
    fig.update_yaxes(title_text="Amount of<br>false predictions", row=3, col=1)
    fig.show()

    # Second figure: accuracy per language, per dataset and per column-name validity.
    fig = make_subplots(
        rows=1, cols=3, shared_yaxes=True,
        subplot_titles=["Accuracy per Language", "Accuracy per Dataset", "Accuracy per Column Name"]
    )
    fig.add_trace(go.Bar(
        x=list(accuracies_lan.keys()),
        y=list(accuracies_lan.values()),
        showlegend=False
    ), row=1, col=1)
    fig.add_trace(go.Bar(
        x=list(accuracies_dat.keys()),
        y=list(accuracies_dat.values()),
        showlegend=False
    ), row=1, col=2)
    fig.add_trace(go.Bar(
        x=list(accuracies_col.keys()),
        y=list(accuracies_col.values()),
        showlegend=False
    ), row=1, col=3)
    fig.update_layout(width=900, height=500, title="Accuracy of Presidio's pii Predictions on own dataset with respect to different categories")
    fig.show()

In [17]:
# dataset="two_languages" makes the stacked bars use the italian/chinese language list.
# Note: the call adds the columns "Classes_new" and "Language" to results_df_lan in place.
create_analysis_plot(results_df_lan, dataset="two_languages")

No need to look into Presidio's wrong predictions in more detail.

In [18]:
# Build per-column metadata (source dataset and dataset type) for the kaggle data.
kaggle_dir = "../../datasets/kaggle_datasets"

# os.listdir returns entries in arbitrary order, but dataset_type below is matched
# to the folders by position, so the folder list is sorted to make the pairing
# deterministic across machines.
# NOTE(review): assumes dataset_type and the column order of all_datasets.csv follow
# the alphabetical folder order (consistent with the displayed outputs) - confirm.
folders = sorted(name for name in os.listdir(kaggle_dir) if os.path.isdir(os.path.join(kaggle_dir, name)))
dataset_kaggle, dataset_kaggle_info = [], []
dataset_type = ["pii info", "pii info", "non-pii info", "pii info", "pii info", "pii info", "pii info", "non-pii info",
                "non-pii info", "pii info", "pii info", "non-pii info", "pii info", "pii info", "non-pii info"]
if len(folders) != len(dataset_type):
    raise ValueError(
        f"Found {len(folders)} dataset folders but {len(dataset_type)} dataset types are defined"
    )

for folder, folder_type in zip(folders, dataset_type):
    path = kaggle_dir + "/" + folder
    # First CSV in the folder that is not a labels file.
    csv_file = [f for f in os.listdir(path) if f.endswith('.csv') and 'labels' not in f][0]
    # Guess the separator from the header line: whichever of "," / ";" occurs more often.
    with open(path + "/" + csv_file, 'r') as file:
        first_line = file.readline()
    sep = "," if first_line.count(',') > first_line.count(';') else ";"
    # Only the number of columns is needed, so just the header is parsed (nrows=0).
    # This also avoids the mixed-dtype warning raised when reading the full file.
    n_columns = pd.read_csv(path + "/" + csv_file, sep=sep, nrows=0).shape[1]
    dataset_kaggle += [folder] * n_columns
    dataset_kaggle_info += [folder_type] * n_columns

results_df_kaggle = pd.DataFrame({
    "Column": kaggle_data.columns,
    "Prediction": results_columnwise_kaggle_final.loc[results_columnwise_kaggle_final.shape[0]-1].values,
    "True Label": kaggle_data_labels["label"].values,
    "Dataset": dataset_kaggle,
    "Dataset Type": dataset_kaggle_info
})


Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.



In [19]:
# False negatives: columns labelled pii that were predicted as non-pii
is_false_negative = results_df_kaggle["True Label"].eq("pii") & results_df_kaggle["Prediction"].eq("non-pii")
results_df_kaggle.loc[is_false_negative]

Unnamed: 0,Column,Prediction,True Label,Dataset,Dataset Type
0,ID,non-pii,pii,absenteeism,pii info
86,Serial No.,non-pii,pii,graduate_admissions,pii info
95,id,non-pii,pii,heart_disease,pii info
223,PassengerId,non-pii,pii,titanic,pii info


In [20]:
# False positives: columns labelled non-pii that were predicted as pii
is_false_positive = results_df_kaggle["True Label"].eq("non-pii") & results_df_kaggle["Prediction"].eq("pii")
results_df_kaggle.loc[is_false_positive]

Unnamed: 0,Column,Prediction,True Label,Dataset,Dataset Type
37,Location,pii,non-pii,agriculture,non-pii info
47,Season,pii,non-pii,agriculture,non-pii info
70,Gender,pii,non-pii,diabetes,pii info
111,fullAddress,pii,non-pii,house_price,non-pii info
112,postcode,pii,non-pii,house_price,non-pii info
114,outcode,pii,non-pii,house_price,non-pii info
116,longitude,pii,non-pii,house_price,non-pii info
139,CORPORATE_IDENTIFICATION_NUMBER,pii,non-pii,indian_companies,non-pii info
145,DATE_OF_REGISTRATION,pii,non-pii,indian_companies,non-pii info
151,REGISTERED_OFFICE_ADDRESS,pii,non-pii,indian_companies,non-pii info
