In [1]:
import pandas as pd
import plotly.graph_objects as go
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, roc_auc_score, recall_score
import numpy as np
from plotly.subplots import make_subplots
import ast
import re

Comparison of the three models (Presidio, CASSED, GPT-4o) on all evaluation datasets (DeSSI-MF, Test Languages, Kaggle, OpenML 1, OpenML 2) for PII detection

In [2]:
def load_data(path, multi=True):
    """Read the five CSV files that make up one test split.

    Args:
        path: Directory holding the split; file names are appended as ``path + "/<file>"``.
        multi: Picks the multi-label file name. Truthy reads
            ``test_labels_multiclass.csv``, falsy reads ``test_labels_multi.csv``.

    Returns:
        Tuple ``(data, data_pii, data_multi, data_classes, data_datasets)`` of DataFrames,
        read from ``test.csv``, ``test_labels_pii.csv``, the selected multi-label file,
        ``test_classes.csv`` and ``test_dataset.csv`` respectively.
    """
    def read(suffix):
        return pd.read_csv(path + suffix)

    multi_suffix = "/test_labels_multiclass.csv" if multi else "/test_labels_multi.csv"
    data = read("/test.csv")
    data_pii = read("/test_labels_pii.csv")
    data_multi = read(multi_suffix)
    return data, data_pii, data_multi, read("/test_classes.csv"), read("/test_dataset.csv")

# Splits that follow the load_data layout (test.csv plus label / class / dataset files).
(own_data_gpt, own_data_gpt_pii, own_data_gpt_multi,
 own_data_gpt_classes, own_data_gpt_datasets) = load_data("../datasets/dessi-mf/dessi-mf_gpt")
(own_data, own_data_pii, own_data_multi,
 own_data_classes, own_data_datasets) = load_data("../datasets/dessi-mf/dessi-mf", multi=False)
(test_lan, test_lan_pii, test_lan_multi,
 test_lan_classes, test_lan_datasets) = load_data("../datasets/test_languages")


def _load_external_benchmark(folder):
    """Read the (data, PII labels, dataset names) CSV triplet stored in ``folder``."""
    return (
        pd.read_csv(folder + "/all_datasets.csv"),
        pd.read_csv(folder + "/all_datasets_labels_pii.csv"),
        pd.read_csv(folder + "/all_datasets_names.csv"),
    )


# Benchmarks that only ship a data file, binary PII labels and dataset names.
test_kaggle, test_kaggle_pii, test_kaggle_dataset = _load_external_benchmark("../datasets/kaggle_datasets")

test_openml, test_openml_pii, test_openml_dataset = _load_external_benchmark("../datasets/openml_datasets")

test_openml_2, test_openml_2_pii, test_openml_2_dataset = _load_external_benchmark("../datasets/openml_datasets_2")

GPT predictions

In [3]:
def load_predictions(path):
    """Parse GPT class predictions (one response per line) and derive binary PII labels.

    The first line of the file is skipped. Every following line must contain a
    ``detected_classes: [...]`` entry (key bare, single- or double-quoted). Quotes are
    removed from the bracket content and comma-separated entries are sorted so that
    multi-class predictions have a canonical order.

    Relies on the module-level ``own_data_gpt_multi`` frame to determine the set of
    single-label semantic classes.

    Args:
        path: Path of the text file holding the responses.

    Returns:
        Tuple ``(pii_transformed, detected_classes)``: a list of ``"pii"`` / ``"non-pii"``
        strings and the list of normalised detected-class strings, one entry per response.

    Raises:
        ValueError: If a response line contains no ``detected_classes`` list.
    """
    # Raw string: the pattern was previously a non-raw f-string that relied on the
    # invalid escape sequence "\[" (a SyntaxWarning on recent Python versions).
    pattern = re.compile(r"""('detected_classes'|"detected_classes"|detected_classes): \[(.*?)\]""")

    with open(path, "r") as f:
        response_text = f.read()
    responses = response_text.split("\n")[1:]
    # A trailing newline at end-of-file produces empty entries that carry no response.
    while responses and not responses[-1].strip():
        responses.pop()

    detected_classes = []
    for line_no, response in enumerate(responses, start=2):
        found = pattern.search(response)
        if found is None:
            raise ValueError(f"{path}: no 'detected_classes' list on line {line_no}: {response!r}")
        detected = found.group(2).replace("'", "").replace("\"", "").replace(", ", ",")
        if "," in detected:
            detected = ",".join(sorted(detected.split(",")))
        detected_classes.append(detected)

    semantic_classes = sorted(set([a for a in own_data_gpt_multi["label"] if "," not in a]))
    NON_pii = ["answer", "city", "color", "company", "cpu", "credit_card_provider", "currency", "date", "dish", "drink", 
                    "duration", "EAN_code", "float_number", "gpe", "graphics", "integer_number", "isbn", "manufacturer", "measure_unit", "phone_model", 
                    "programming_language", "resolution", "SWIFT/BIC code", "system_quality_attribute", "url", "user_agent", "version", "word"]
    pii_classes = list(set(semantic_classes) - set(NON_pii))

    # NOTE(review): this is a substring test against the whole detected string, not a
    # per-class membership test; the explicit "programming_language" exclusion looks like
    # a workaround for one such substring hit. Confirm before tightening the match.
    pii_transformed = [
        "pii"
        if detected != "programming_language" and any(a in detected for a in pii_classes)
        else "non-pii"
        for detected in detected_classes
    ]
    return pii_transformed, detected_classes

def load_predictions_dataset(path, data, labels, dataset):
    """Parse per-column true/false PII verdicts from a GPT response file.

    The first line of the file is skipped; response line ``j`` (after that) must contain
    ``<column name>: <verdict>`` for column ``j`` of ``data``, where the column name may be
    bare or quoted and the verdict is ``true``/``false`` in lower or title case, bare or
    quoted. A verdict containing "false" (case-insensitive) maps to ``"non-pii"``,
    anything else to ``"pii"``.

    Args:
        path: Path of the text file holding the responses.
        data: DataFrame whose column names are looked up in the responses.
        labels: DataFrame with a ``"label"`` column (true labels).
        dataset: DataFrame with a ``"dataset"`` column (source dataset per column).

    Returns:
        DataFrame with the columns ``Name``, ``True Label``, ``Prediction`` and ``Dataset``.

    Raises:
        ValueError: If the file has fewer response lines than ``data`` has columns, or a
            line does not contain the expected ``<column name>: <verdict>`` entry.
    """
    # Accepted spellings of the verdict, bare or quoted.
    verdict = r"""(true|false|"true"|"false"|'true'|'false'|True|False|"True"|"False"|'True'|'False')"""

    with open(path, "r") as f:
        responses = f.read().split("\n")[1:]

    pii = []
    for j, column in enumerate(data.columns):
        if j >= len(responses):
            raise ValueError(
                f"{path}: no response line for column {column!r} (index {j}); "
                f"file has only {len(responses)} response lines"
            )
        # re.escape neutralises every regex metacharacter in the column name, not only
        # "(", ")" and "$", so names such as "a+b" or "x[0]" are matched literally.
        name = re.escape(str(column))
        found = re.search(f"({name}|'{name}'|\"{name}\"): " + verdict, responses[j])
        if found is None:
            raise ValueError(
                f"{path}: no true/false verdict for column {column!r} in response: {responses[j]!r}"
            )
        pii.append(found.group(2))
    pii_transformed = ["non-pii" if "false" in a.lower() else "pii" for a in pii]

    results = pd.DataFrame({
        "Name": data.columns,
        "True Label": labels["label"],
        "Prediction": pii_transformed,
        "Dataset": dataset["dataset"]
    })
    return results

GPT_PREDICTIONS_DIR = "../GPT/gpt_predictions"


def _gpt_class_results(results_file, data, pii_labels, classes, datasets):
    """Load class-level GPT predictions and combine them with the split's metadata.

    Returns ``(binary predictions, detected class strings, combined DataFrame)``.
    """
    binary_predictions, detected = load_predictions(GPT_PREDICTIONS_DIR + "/" + results_file)
    frame = pd.DataFrame({
        "Column": data.columns,
        "True Label": pii_labels["label"],
        "Prediction": binary_predictions,
        "Classes": classes["class"],
        "Predicted Classes": detected,
        "Dataset": datasets["dataset"]
    })
    return binary_predictions, detected, frame


# Splits with semantic-class predictions.
results_gpt_own_data, detected_classes_own_dataset, results_gpt_df_own_data = _gpt_class_results(
    "dessi-mf_results.txt", own_data_gpt, own_data_gpt_pii, own_data_gpt_classes, own_data_gpt_datasets
)

results_gpt_test_lan, detected_classes_test_lan, results_gpt_df_test_lan = _gpt_class_results(
    "test_languages_results.txt", test_lan, test_lan_pii, test_lan_classes, test_lan_datasets
)

# Benchmarks with a direct true/false verdict per column.
results_gpt_df_kaggle = load_predictions_dataset(
    GPT_PREDICTIONS_DIR + "/kaggle_results.txt", test_kaggle, test_kaggle_pii, test_kaggle_dataset
)

results_gpt_df_openml = load_predictions_dataset(
    GPT_PREDICTIONS_DIR + "/openml_results.txt", test_openml, test_openml_pii, test_openml_dataset
)

results_gpt_df_openml_2 = load_predictions_dataset(
    GPT_PREDICTIONS_DIR + "/openml_2_results.txt", test_openml_2, test_openml_2_pii, test_openml_2_dataset
)

CASSED predictions

In [4]:
def extract_prediction(i, predictions):
    """Return the label names found in the first cell of row ``i`` of ``predictions``.

    A label is a run of letters, digits or hyphens that is followed (after optional
    whitespace) by a parenthesised number, e.g. ``pii (0.97)``. Labels are returned in
    order of appearance; the numbers are discarded.
    """
    cell = predictions.iloc[i, :].values[0]
    labels = []
    for hit in re.finditer(r"([a-zA-Z0-9\-]+)\s*\(\d+(\.\d+)?\)", cell):
        labels.append(hit.group(1))
    return labels


def convert_predictions(predictions):
    """Turn a single-column prediction log into a table with one row per entry.

    The log is scanned row by row, looking only at the first cell of each row:

    * a cell with ``Gold:`` in its first 10 characters contributes a gold label,
    * a cell with ``Pred:`` in its first 10 characters contributes a prediction; the
      entry is flagged ``"Mismatch"`` when the next row has ``MISMATCH`` in its first
      20 characters, otherwise ``"No Mismatch"``,
    * any other cell without ``MISMATCH`` in its first 20 characters contributes a name
      (the text before the first ``". "``).

    The first name is taken from the header of the frame (``predictions.columns[0]``).

    Returns:
        DataFrame with the columns ``Name``, ``Gold``, ``Pred`` and ``Mismatch``.
    """
    n_rows = predictions.shape[0]
    names = [predictions.columns[0].split(". ")[0].strip()]
    golds, preds, flags = [], [], []

    row = 0
    while row < n_rows:
        cell = predictions.iloc[row, :].values[0]
        if 'Gold:' in cell[:10]:
            # str(list)[2:-2] strips the leading "['" and trailing "']" of the list repr.
            golds.append(str(extract_prediction(row, predictions))[2:-2])
        elif 'Pred:' in cell[:10]:
            label = str(extract_prediction(row, predictions))[2:-2]
            # a "," in the prediction means that both classes pii and non-pii were predicted
            preds.append("pii" if "," in label else label)
            if row == n_rows - 1:
                # Last row of the log: there is no following row to inspect.
                flags.append("No Mismatch")
                break
            following = predictions.iloc[row + 1, :].values[0]
            flags.append("Mismatch" if 'MISMATCH' in following[:20] else "No Mismatch")
        elif 'MISMATCH' not in cell[:20]:
            names.append(cell.split('. ')[0].strip())
        row += 1

    return pd.DataFrame({
        'Name': names,
        'Gold': golds,
        'Pred': preds,
        'Mismatch': flags
    })

def get_tested_predictions(path):
    """Decode indicator-encoded true and predicted labels stored under ``path``.

    Reads ``true.csv``, ``predicted.csv`` and ``target_names.csv``. The latter maps the
    values of its ``"0.1"`` column (positions) to the values of its ``"0"`` column
    (names); position 0 is always mapped to ``"no prediction"``. For every row, the name
    of the first position whose value equals 1 is taken.

    Returns:
        Tuple ``(true, predictions)`` of lists with one label name per row of ``true.csv``.
    """
    truth_frame = pd.read_csv(path + "/true.csv")
    predicted_frame = pd.read_csv(path + "/predicted.csv")
    names_frame = pd.read_csv(path + "/target_names.csv")

    position_to_name = {}
    for name, position in names_frame[["0", "0.1"]].values:
        position_to_name[position] = name
    position_to_name[0] = "no prediction"

    def active_names(row):
        return [position_to_name[pos] for pos, flag in enumerate(row) if flag == 1]

    predicted_lists, true_lists = [], []
    for row_idx in range(truth_frame.shape[0]):
        predicted_lists.append(active_names(predicted_frame.iloc[row_idx, :]))
        true_lists.append(active_names(truth_frame.iloc[row_idx, :]))

    first_true = [names[0] for names in true_lists]
    first_predicted = [names[0] for names in predicted_lists]
    return first_true, first_predicted

CASSED_RESULTS_DIR = "../CASSED/CASSED_model_results/cassed_pii"


def _cassed_results_frame(columns, predicted, true_labels, datasets, classes=None):
    """Assemble a CASSED result table; ``Classes`` is only added when ``classes`` is given.

    Column order is Column, Prediction, True Label, [Classes], Dataset.
    """
    content = {"Column": columns, "Prediction": predicted, "True Label": true_labels}
    if classes is not None:
        content["Classes"] = classes
    content["Dataset"] = datasets
    return pd.DataFrame(content)


# DeSSI-MF: predictions and gold labels both come from the CASSED test log.
results_cassed_own_data = convert_predictions(
    pd.read_csv(CASSED_RESULTS_DIR + "/dessi-mf/test.tsv", sep='\t')
)
results_cassed_df_own_data = _cassed_results_frame(
    own_data.columns,
    results_cassed_own_data["Pred"],
    results_cassed_own_data["Gold"],
    own_data_datasets["dataset"],
    classes=own_data_classes["class"],
)

# Remaining splits: predictions come from the indicator files, true labels from the label CSVs.
true, predictions = get_tested_predictions(CASSED_RESULTS_DIR + "/two_languages")
results_cassed_df_test_lan = _cassed_results_frame(
    test_lan.columns,
    predictions,
    test_lan_pii["label"].values,
    test_lan_datasets["dataset"],
    classes=test_lan_classes["class"],
)

true, predictions = get_tested_predictions(CASSED_RESULTS_DIR + "/kaggle")
results_cassed_df_test_kaggle = _cassed_results_frame(
    test_kaggle.columns, predictions, test_kaggle_pii["label"].values, test_kaggle_dataset["dataset"]
)

true, predictions = get_tested_predictions(CASSED_RESULTS_DIR + "/openml")
results_cassed_df_test_openml = _cassed_results_frame(
    test_openml.columns, predictions, test_openml_pii["label"].values, test_openml_dataset["dataset"]
)

true, predictions = get_tested_predictions(CASSED_RESULTS_DIR + "/openml_2")
results_cassed_df_test_openml_2 = _cassed_results_frame(
    test_openml_2.columns, predictions, test_openml_2_pii["label"].values, test_openml_2_dataset["dataset"]
)

Presidio predictions

In [5]:
# Entity classes that are treated as containing PII; every class not listed here is
# treated as non-PII.
# Classes regarded as non-personal: "AU_ABN", "AU_ACN", "IP_ADDRESS", "LOCATION", "shared LOCATION", "URL"
# NOTE(review): "IP_ADDRESS" is named as non-personal in the line above but is also part of
# the list below, so it is currently classified as PII - confirm which one is intended.
PII_CLASSES = [
    "AU_MEDICARE",
    "AU_TFN",
    "CREDIT_CARD",
    "EMAIL_ADDRESS",
    "IBAN_CODE",
    "IN_AADHAAR",
    "IN_PAN",
    "IN_PASSPORT",
    "IP_ADDRESS",
    "IN_VEHICLE_REGISTRATION",
    "MEDICAL_LICENSE",
    "PERSON",
    "PHONE_NUMBER",
    "UK_NHS",
    "US_BANK_NUMBER",
    "US_DRIVER_LICENSE",
    "US_ITIN",
    "US_PASSPORT",
    "US_SSN",
    # "shared" variants
    "shared IN_PAN",
    "shared IN_VEHICLE_REGISTRATION",
    "shared PERSON",
    "shared PHONE_NUMBER",
    "shared US_BANK_NUMBER",
    "shared US_DRIVER_LICENSE",
]

In [6]:
def get_categories_binary(strings):
    """Map a list of entity names to ``"pii"`` or ``"non-pii"``.

    Returns ``"pii"`` as soon as one entry is contained in ``PII_CLASSES``; an empty
    list, or a list without any such entry, yields ``"non-pii"``.
    """
    if strings == []:
        return "non-pii"
    return "pii" if any(s in PII_CLASSES for s in strings) else "non-pii"

def build_dataframe_binary(results_df, threshold_score, threshold_count):
    """Append per-column aggregation rows and a binary pii/non-pii verdict to a copy of ``results_df``.

    Three rows are added below the existing ones (row labels ``height``, ``height+1`` and
    ``height+2``, where ``height`` is the original number of rows):

    * ``height``: the distinct items of the column whose occurrence count exceeds
      ``threshold_count``,
    * ``height+1``: the matching occurrence counts,
    * ``height+2``: the result of ``get_categories_binary`` for the items of row
      ``height`` whose embedded score is at least ``threshold_score``.

    The thresholds are printed as a side effect. ``results_df`` itself is not modified.

    Args:
        results_df: Frame whose rows 2, 4, 6, ... (positions, up to the original height)
            hold string literals that ``ast.literal_eval`` turns into iterables of items.
            Presumably the items look like ``<ENTITY>_<score>`` - TODO confirm against
            the code that writes ``results_columnwise.csv``.
        threshold_score: Minimum score (first decimal number found in the item) for an
            item to be considered.
        threshold_count: An item is kept only if it occurs more than this many times.

    Returns:
        The extended copy of ``results_df``.
    """
    height, width = results_df.shape
    results_copy = results_df.copy()
    print(f"Threshold score: {threshold_score}\nThreshold count: {threshold_count}")
    # One fresh list per column so that the cells of the new rows do not share state.
    empty_row1 = [[] for _ in range(width)]
    empty_row2 = [[] for _ in range(width)]
    empty_row3 = ["" for _ in range(width)]
    # Assigning to a not-yet-existing label enlarges the frame by one row each time.
    results_copy.loc[height] = empty_row1.copy()
    results_copy.loc[height+1] = empty_row2.copy()
    results_copy.loc[height+2] = empty_row3.copy()
    for i in range(width):
        # Flatten the parsed lists of every second row (positions 2, 4, ... below the
        # original height) and count how often each item occurs in this column.
        series = pd.Series([item for sublist in results_copy.iloc[2:height:2, i]
                        for item in ast.literal_eval(sublist)], dtype="object").value_counts()
        for a, b in series.items():
            if b > threshold_count:
                # The cells hold list objects; appending mutates them in place.
                results_copy.iloc[height, i].append(a)
                results_copy.iloc[height+1, i].append(b)
    for i in range(width):
        # Keep items whose first decimal number is >= threshold_score and drop the
        # trailing "_<number>" suffix before mapping them to pii / non-pii.
        # NOTE(review): an item without any decimal number makes re.search return None
        # and raises AttributeError here.
        entities = [re.sub(r'_\d+(\.\d+)?$', '', item)
                    for item in results_copy.iloc[height, i] if float(re.search(r'(\d+\.\d+)', item).group(1)) >= threshold_score]
        mapped_entities = get_categories_binary(entities)
        results_copy.iloc[height+2, i] = mapped_entities
    return results_copy

In [7]:
def _presidio_columnwise(folder, pii_labels, threshold_score, threshold_count):
    """Load the column-wise Presidio results of ``folder`` and apply the thresholds.

    Row 0 of the loaded frame is overwritten with the true PII labels before the
    aggregation rows are built. Returns ``(columnwise frame, thresholded frame)``.
    """
    columnwise = pd.read_csv(f"../Presidio/predictions/binary_results/{folder}/results_columnwise.csv")
    columnwise.iloc[0, :] = pii_labels["label"].values
    thresholds = build_dataframe_binary(columnwise, threshold_score, threshold_count)
    return columnwise, thresholds


def _presidio_results_frame(columns, thresholds, datasets, classes=None):
    """Assemble a Presidio result table; ``Classes`` is only added when ``classes`` is given.

    The prediction is read from the last row label of ``thresholds`` and the true label
    from row label 0. Column order is Column, Prediction, True Label, [Classes], Dataset.
    """
    last_label = thresholds.shape[0] - 1
    content = {
        "Column": columns,
        "Prediction": thresholds.loc[last_label].values,
        "True Label": thresholds.loc[0].values,
    }
    if classes is not None:
        content["Classes"] = classes
    content["Dataset"] = datasets
    return pd.DataFrame(content)


results_columnwise_own_data, results_columnwise_own_data_thresholds = _presidio_columnwise(
    "dessi-mf", own_data_pii, 0.9, 0
)
results_presidio_df_own_data = _presidio_results_frame(
    own_data.columns, results_columnwise_own_data_thresholds,
    own_data_datasets["dataset"], classes=own_data_classes["class"],
)

results_columnwise_two_languages, results_columnwise_two_languages_thresholds = _presidio_columnwise(
    "test_languages_res", test_lan_pii, 0, 60
)
results_presidio_df_two_languages = _presidio_results_frame(
    test_lan.columns, results_columnwise_two_languages_thresholds,
    test_lan_datasets["dataset"], classes=test_lan_classes["class"],
)

results_columnwise_kaggle, results_columnwise_kaggle_thresholds = _presidio_columnwise(
    "kaggle", test_kaggle_pii, 0.1, 20
)
results_presidio_df_kaggle = _presidio_results_frame(
    test_kaggle.columns, results_columnwise_kaggle_thresholds, test_kaggle_dataset["dataset"]
)

results_columnwise_openml, results_columnwise_openml_thresholds = _presidio_columnwise(
    "openml", test_openml_pii, 0.4, 70
)
results_presidio_df_openml = _presidio_results_frame(
    test_openml.columns, results_columnwise_openml_thresholds, test_openml_dataset["dataset"]
)

results_columnwise_openml_2, results_columnwise_openml_2_thresholds = _presidio_columnwise(
    "openml_2", test_openml_2_pii, 0.1, 50
)
results_presidio_df_openml_2 = _presidio_results_frame(
    test_openml_2.columns, results_columnwise_openml_2_thresholds, test_openml_2_dataset["dataset"]
)

Threshold score: 0.9
Threshold count: 0
Threshold score: 0
Threshold count: 60
Threshold score: 0.1
Threshold count: 20
Threshold score: 0.4
Threshold count: 70
Threshold score: 0.1
Threshold count: 50


## Compare Performance

In [17]:
def plot_metrics_results(results_gpt, results_cassed, results_presidio, dataset="DeSSI-MF"):
    """Show a grouped metric bar chart and one confusion-matrix heatmap per model.

    Args:
        results_gpt, results_cassed, results_presidio: DataFrames with the columns
            ``"True Label"`` and ``"Prediction"``; the label ``"pii"`` is the positive
            class, every other value is treated as negative for the confusion matrix
            and AUC-ROC.
        dataset: Name of the dataset, used in the figure titles.

    Two plotly figures are displayed; nothing is returned.
    """
    fig1 = go.Figure()
    fig2 = make_subplots(rows=1, cols=3, subplot_titles=["GPT-4o", "CASSED", "Presidio"], horizontal_spacing=0.1)
    colorscales = ["Blues", "Reds", "Greens"]
    for e, (results, name) in enumerate(zip([results_gpt, results_cassed, results_presidio], ["GPT-4o", "CASSED", "Presidio"])):
        y_true = results["True Label"]
        y_pred = results["Prediction"]
        y_true_bin = [1 if label == "pii" else 0 for label in y_true]
        y_pred_bin = [1 if label == "pii" else 0 for label in y_pred]

        # labels=[0, 1] pins the matrix to 2x2 even if only one class occurs; without it
        # the cell lookups below fail on a 1x1 matrix.
        # fliplr puts the "pii" prediction into the first column to match the x labels.
        cm = np.fliplr(confusion_matrix(y_true_bin, y_pred_bin, labels=[0, 1]))
        cm_norm = np.fliplr(confusion_matrix(y_true_bin, y_pred_bin, labels=[0, 1], normalize="true") * 100)

        # NOTE(review): the scores below use the raw string labels, whereas the confusion
        # matrix and AUC-ROC use the binarised ones; they agree as long as the labels are
        # exactly "pii" / "non-pii".
        accuracy = accuracy_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred, pos_label="pii")
        f1_weighted = f1_score(y_true, y_pred, average="weighted")
        f1_macro = f1_score(y_true, y_pred, average="macro")
        precision = precision_score(y_true, y_pred, pos_label="pii", zero_division=0)
        recall = recall_score(y_true, y_pred, pos_label="pii")
        # AUC-ROC is undefined when the true labels contain a single class; plot no bar
        # (NaN) in that case instead of aborting the whole comparison.
        if len(set(y_true_bin)) == 2:
            auc_roc = roc_auc_score(y_true_bin, y_pred_bin)
        else:
            auc_roc = float("nan")

        metrics = {
            "Precision": precision,
            "Recall": recall,
            "F1-Score": f1,
            "Weighted F1": f1_weighted,
            "Macro F1": f1_macro,
            "Accuracy": accuracy,
            "AUC-ROC": auc_roc
        }

        fig1.add_trace(go.Bar(
            x=list(metrics.keys()),
            y=list(metrics.values()), 
            name=name
        ))

        # Absolute count plus row-normalised percentage for every cell.
        text_values = [
            [f"{cm[0,0]}<br>({cm_norm[0,0]:.2f}%)", f"{cm[0,1]}<br>({cm_norm[0,1]:.2f}%)"],
            [f"{cm[1,0]}<br>({cm_norm[1,0]:.2f}%)", f"{cm[1,1]}<br>({cm_norm[1,1]:.2f}%)"]
        ]
        cm_heatmap = go.Heatmap(
            z=cm, 
            x=["Pred: pii", "Pred: non-pii"], 
            y=["True: non-<br>pii", "True:<br>pii"], 
            colorscale=colorscales[e], 
            showscale=False,
            text=text_values, 
            texttemplate="%{text}",
            textfont={"size":14},
        )

        fig2.add_trace(cm_heatmap, row=1, col=e+1)

    fig1.update_layout(
        title=f"Evaluation Metrics for All Three Models on {dataset}",
        height=400, 
        width=900,
        legend=dict(title="Model")
    )
    fig1.update_yaxes(title_text="Score", range=[0, 1.05])
    fig1.update_xaxes(title_text="Metric")
    fig2.update_layout(
        title=f"Confusion Matrix for All Three Models on {dataset}",
        height=350, 
        width=900
    )
    fig1.show()
    fig2.show()

In [9]:
# Lower-cased first "_"-separated token of every distinct class label in the CASSED
# DeSSI-MF results; used to decide whether a column name is a meaningful word.
CLASS_NAMES = [
    label.lower().split("_")[0]
    for label in results_cassed_df_own_data["Classes"].unique()
]

def is_valid_word(word):
    """Return True when the lower-cased text before the first "_" of ``word`` is in ``CLASS_NAMES``."""
    prefix, _, _ = word.lower().partition("_")
    return prefix in CLASS_NAMES

_LANGUAGE_BY_SUFFIX = {"_fr": "french", "_it": "italian", "_zh": "chinese", "_de": "german", "_en": "english"}


def _strip_language_tag(class_label):
    """Remove the language tag from a class label; sort comma-separated labels otherwise."""
    if "mixed" in class_label or "de_DE" in class_label or "fr_FR" in class_label:
        return class_label[:-6]
    if any(tag in class_label for tag in ("_en", "_de", "_fr", "_it", "_zh")):
        return class_label[:-3]
    if "," in class_label:
        return ",".join(sorted(class_label.split(",")))
    return class_label


def _language_of(class_label):
    """Derive the language name from a class label ("dessi data" when no tag matches)."""
    if "mixed" in class_label:
        return "mixed language"
    return _LANGUAGE_BY_SUFFIX.get(class_label[-3:].lower(), "dessi data")


def _accuracy_by_group(frame, column):
    """Accuracy of ``Prediction`` vs ``True Label`` per value of ``column``, best first."""
    scores = {}
    for value in frame[column].unique():
        group = frame.loc[frame[column] == value]
        scores[value] = accuracy_score(group["True Label"], group["Prediction"])
    return dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))


def analyse_results(results_gpt, results_cassed, results_presidio, dataset="DeSSI-MF", width=1430, height=740, subplot_widths=[0.5, 0.3, 0.2], ticks=[0, 0.2, 0.4, 0.6, 0.8, 1.0]):
    """Break the accuracy of the three models down by class, language, dataset and column-name type.

    Two plotly figures are displayed:

    * one stacked bar chart per model showing, per semantic class, the share of correctly
      predicted columns split by language and by valid / invalid column name,
    * a three-panel bar chart with the accuracy per language, per original dataset and
      per column-name type ("meaningful" vs "random generated").

    Args:
        results_gpt, results_cassed, results_presidio: DataFrames with the columns
            ``Column``, ``True Label``, ``Prediction``, ``Classes`` and ``Dataset``.
            The frames are copied; the inputs are not modified.
        dataset: Dataset name for the titles; ``"Test Languages"`` switches the language
            list to italian / chinese.
        width, height: Size of the per-class figure.
        subplot_widths: Relative widths of the three panels of the second figure.
        ticks: Tick values of the y axes of the per-class figure.
    """
    fig1 = make_subplots(rows=3, cols=1, subplot_titles=["GPT-4o Accuracy by Class", "CASSED Accuracy by Class", "Presidio Accuracy by Class"], vertical_spacing=0.075,
                    shared_xaxes=True, y_title="Accuracy")

    fig2 = make_subplots(
        rows=1, cols=3, shared_yaxes=True, column_widths=subplot_widths,
        subplot_titles=["Language", "Original Dataset", "Type of<br>Column Name"]
    )

    # One saturated / pale colour pair per language (valid / invalid column names).
    colors = [
        "#d62728",  # Red (High Saturation)
        "#ff9896",  # Red (Low Saturation)
        "#1f77b4",  # Blue (High Saturation)
        "#aec7e8",  # Blue (Low Saturation)
        "#ff7f0e",  # Orange (High Saturation)
        "#ffbb78",  # Orange (Low Saturation)
        "#2ca02c",  # Green (High Saturation)
        "#98df8a",  # Green (Low Saturation)
        "#9467bd",  # Purple (High Saturation)
        "#c5b0d5",  # Purple (Low Saturation)
    ]
    languages = ['english', 'french', 'german', 'mixed language', 'dessi data'] if dataset != "Test Languages" else ["italian", "chinese"]

    for enu, (results_df, col, name) in enumerate(
        zip([results_gpt.copy(), results_cassed.copy(), results_presidio.copy()], ['#636efa', '#EF553B', '#00cc96'], ["GPT-4o", "CASSED", "Presidio"])):
        # Columns are addressed by name: the three frames order "True Label" and
        # "Prediction" differently, so positional access is fragile.
        results_df["Classes_new"] = [_strip_language_tag(label) for label in results_df["Classes"]]
        results_df["Language"] = [_language_of(label) for label in results_df["Classes"]]

        accuracies_lan = _accuracy_by_group(results_df, "Language")
        accuracies_dat = _accuracy_by_group(results_df, "Dataset")

        # Accuracy for columns whose name is a meaningful word vs. all other columns.
        has_valid_name = np.array([is_valid_word(column) for column in results_df["Column"]], dtype=bool)
        valid_rows = results_df.loc[has_valid_name]
        invalid_rows = results_df.loc[~has_valid_name]
        accuracy_valid = accuracy_score(valid_rows["True Label"], valid_rows["Prediction"])
        accuracy_invalid = accuracy_score(invalid_rows["True Label"], invalid_rows["Prediction"])

        for e, language in enumerate(languages):
            lan_df = results_df.loc[results_df["Language"] == language]
            classes = list(lan_df["Classes_new"].unique())
            for ee, col_valid in enumerate([True, False]):
                weighted_accuracy = []
                for cls in classes:
                    cla_df = lan_df.loc[lan_df["Classes_new"] == cls]
                    keep = np.array([is_valid_word(column) == col_valid for column in cla_df["Column"]], dtype=bool)
                    cla_df = cla_df.loc[keep]
                    # Share of this (language, name type) subset among all columns of the
                    # class, so that the stacked bars of one class add up to its accuracy.
                    share = cla_df.shape[0] / results_df.loc[results_df["Classes_new"] == cls].shape[0]
                    accuracy = (cla_df["True Label"] == cla_df["Prediction"]).value_counts(normalize=True).get(True, 0)
                    weighted_accuracy.append(accuracy * share)
                fig1.add_trace(go.Bar(
                    x=classes,
                    y=weighted_accuracy,
                    marker=dict(color=colors[2*e+ee]),
                    name=f"{language}_" + ("valid" if col_valid == True else "invalid") + "<br>column name",
                    showlegend=True if enu == 0 else False,
                    legendgroup=2*e+ee
                ), row=enu+1, col=1)

        fig2.add_trace(go.Bar(
            x=list(accuracies_lan.keys()),
            y=list(accuracies_lan.values()),
            name=name,
            showlegend=True,
            legendgroup=enu,
            marker=dict(color=col)
        ), row=1, col=1)
        fig2.add_trace(go.Bar(
            x=list(accuracies_dat.keys()),
            y=list(accuracies_dat.values()),
            name=name,
            showlegend=False,
            legendgroup=enu,
            marker=dict(color=col)
        ), row=1, col=2)
        # The values are given in the same fixed order as the labels. Sorting them by
        # accuracy would attach the wrong label whenever the columns with invalid names
        # score higher than those with valid names.
        fig2.add_trace(go.Bar(
            x=["meaningful", "random generated"],
            y=[accuracy_valid, accuracy_invalid],
            name=name,
            showlegend=False,
            legendgroup=enu,
            marker=dict(color=col)
        ), row=1, col=3)
    fig1.update_layout(title=f"Accuracy by Semantic Class for pii Predictions on {dataset}", width=width, height=height, barmode="stack", legend=dict(title="Properties of Column"), 
                       yaxis=dict(tickvals=ticks), yaxis2=dict(tickvals=ticks), yaxis3=dict(tickvals=ticks))
    fig1.show()
    fig2.update_layout(width=800, height=400, title=f"Accuracy of Predictions by Language, Dataset, and Column Type on {dataset}", yaxis=dict(title="Accuracy"),
                       legend=dict(title="Model"))
    fig2.show()

In [10]:
# Overall metrics and confusion matrices on DeSSI-MF (default dataset title).
plot_metrics_results(
    results_gpt=results_gpt_df_own_data,
    results_cassed=results_cassed_df_own_data,
    results_presidio=results_presidio_df_own_data,
)

In [11]:
# Accuracy breakdown by class, language, dataset and column-name type on DeSSI-MF.
analyse_results(
    results_gpt=results_gpt_df_own_data,
    results_cassed=results_cassed_df_own_data,
    results_presidio=results_presidio_df_own_data,
)

two languages

In [12]:
# Overall metrics and confusion matrices on the test-languages split.
plot_metrics_results(
    results_gpt=results_gpt_df_test_lan,
    results_cassed=results_cassed_df_test_lan,
    results_presidio=results_presidio_df_two_languages,
    dataset="Test Languages",
)

In [13]:
# Accuracy breakdown on the test-languages split, with a smaller figure and equal panel widths.
analyse_results(
    results_gpt=results_gpt_df_test_lan,
    results_cassed=results_cassed_df_test_lan,
    results_presidio=results_presidio_df_two_languages,
    dataset="Test Languages",
    width=1100,
    height=500,
    subplot_widths=[1/3,1/3,1/3],
    ticks=[0, 0.25, 0.5, 0.75, 1.0],
)

kaggle

In [14]:
# Overall metrics and confusion matrices on the Kaggle datasets.
plot_metrics_results(
    results_gpt=results_gpt_df_kaggle,
    results_cassed=results_cassed_df_test_kaggle,
    results_presidio=results_presidio_df_kaggle,
    dataset="Kaggle Data",
)

openml

In [15]:
# Overall metrics and confusion matrices on the first OpenML collection.
plot_metrics_results(
    results_gpt=results_gpt_df_openml,
    results_cassed=results_cassed_df_test_openml,
    results_presidio=results_presidio_df_openml,
    dataset="OpenML Datasets 1",
)

openml 2 data

In [18]:
# Overall metrics and confusion matrices on the second OpenML collection.
plot_metrics_results(
    results_gpt=results_gpt_df_openml_2,
    results_cassed=results_cassed_df_test_openml_2,
    results_presidio=results_presidio_df_openml_2,
    dataset="OpenML Datasets 2",
)