In [5]:
import pandas as pd
import plotly.graph_objects as go
import re
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, roc_auc_score, recall_score
from plotly.subplots import make_subplots
import os

In [6]:
own_data = pd.read_csv("../../datasets/dessi-mf/dessi-mf/test.csv")
own_data_personal = pd.read_csv("../../datasets/dessi-mf/dessi-mf/test_labels_personal.csv")
own_data_classes = pd.read_csv("../../datasets/dessi-mf/dessi-mf/test_classes.csv")
own_data_datasets = pd.read_csv("../../datasets/dessi-mf/dessi-mf/test_dataset.csv")

In [7]:
def extract_prediction(i, predictions):
    list_extractor = re.compile(r"([a-zA-Z0-9\-]+)\s*\(\d+(\.\d+)?\)")
    matches = list_extractor.findall(predictions.iloc[i, :].values[0])
    return [match[0] for match in matches]


def convert_predictions(predictions):
    column_names = []
    gold_list = []
    pred_list = []
    mismatch = []
    column_names.append(predictions.columns[0].split(". ")[0].strip())

    height = predictions.shape[0]

    for i in range(height):
        if 'Gold:' in predictions.iloc[i, :].values[0][:10]:
            gold_list.append(str(extract_prediction(i, predictions))[2:-2])
        elif 'Pred:' in predictions.iloc[i, :].values[0][:10]:
            pred_list.append(str(extract_prediction(i, predictions))[2:-2])
            if i == height-1:
                mismatch.append("No Mismatch")
                break
            if 'MISMATCH' in predictions.iloc[i+1, :].values[0][:20]:
                mismatch.append("Mismatch")
            else:
                mismatch.append("No Mismatch")
        elif 'MISMATCH' not in predictions.iloc[i, :].values[0][:20]:
            column_names.append(predictions.iloc[i, :].values[0].split('. ')[0].strip())
            
    df = pd.DataFrame({
        'Name': column_names,
        'Gold': gold_list,
        'Pred': pred_list,
        'Mismatch': mismatch
    })
    return df

In [8]:
results_own_data = pd.read_csv("../CASSED_model_results/cassed_personal/dessi-mf/test.tsv", sep='\t')
results_own_data = convert_predictions(results_own_data)

In [9]:
def plot_cassed_results(y_true,y_pred):
    y_true_bin = [1 if label == "personal" else 0 for label in y_true]
    y_pred_bin = [1 if label == "personal" else 0 for label in y_pred]

    cm = confusion_matrix(y_true, y_pred, labels=["personal", "non-personal"])
    cm_norm = confusion_matrix(y_true, y_pred, labels=["personal", "non-personal"], normalize="true")

    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred,average="weighted")
    precision = precision_score(y_true, y_pred, average="weighted", zero_division=0)
    recall = recall_score(y_true, y_pred, average="weighted")
    auc_roc = roc_auc_score(y_true_bin, y_pred_bin)

    fig = make_subplots(
        rows=1, cols=2, 
        subplot_titles=["Evaluation Metrics", "Confusion Matrix"]
    )

    metrics = {
        "weighted Precision": precision,
        "weighted Recall": recall,
        "weighted F1 Score": f1,
        "Accuracy": accuracy,
        "AUC-ROC": auc_roc
    }

    fig.add_trace(go.Bar(
        x=list(metrics.keys()),
        y=list(metrics.values()), 
        showlegend=False
    ), row=1, col=1)

    text_values = [
        [f"{cm[0,0]}<br>{cm_norm[0,0]:.2f}", f"{cm[0,1]}<br>{cm_norm[0,1]:.2f}"],
        [f"{cm[1,0]}<br>{cm_norm[1,0]:.2f}", f"{cm[1,1]}<br>{cm_norm[1,1]:.2f}"]
    ]
    cm_heatmap = go.Heatmap(
        z=cm, 
        x=["Pred: personal", "Pred: non-personal"], 
        y=["True: personal", "True: non-personal"], 
        colorscale="Blues", 
        showscale=False,
        text=text_values, 
        texttemplate="%{text}",
        textfont={"size":20},
    )


    fig.add_trace(cm_heatmap, row=1, col=2)

    fig.update_layout(
        title="Confusion Matrix and Evaluation Metrics",
        height=500, 
        width=1000,
        showlegend=False
    )
    fig.update_yaxes(range=[0, 1.1], row=1, col=1)

    fig.show()
plot_cassed_results(results_own_data["Gold"], results_own_data["Pred"])

In [10]:
results_df_own_data = pd.DataFrame({
    "Column": results_own_data["Name"],
    "Prediction": results_own_data["Pred"],
    "True Label": results_own_data["Gold"],
    "Classes": own_data_classes["class"],
    "Dataset": own_data_datasets["dataset"]
})

In [11]:
#False negatives
results_df_own_data.loc[(results_df_own_data["True Label"] == "personal") & (results_df_own_data["Prediction"] == "non-personal")]

Unnamed: 0,Column,Prediction,True Label,Classes,Dataset


In [12]:
#False positives
results_df_own_data.loc[(results_df_own_data["True Label"] == "non-personal") & (results_df_own_data["Prediction"] == "personal")]

Unnamed: 0,Column,Prediction,True Label,Classes,Dataset


In [13]:
CLASS_NAMES = results_df_own_data["Classes"].unique()
CLASS_NAMES = [a.lower().split("_")[0] for a in CLASS_NAMES]

def is_valid_word(word):
    return word.lower().split("_")[0] in CLASS_NAMES

def create_analysis_plot(results_df, dataset="dessi-mf"):
    cla_new, lan = [], []
    for i in range(len(results_df["Classes"])):
        if "mixed" in results_df.iloc[i, 3] or "de_DE" in results_df.iloc[i, 3] or "fr_FR" in results_df.iloc[i, 3]:
            cla_new.append(results_df.iloc[i, 3][:-6])
        elif "_en" in results_df.iloc[i, 3] or "_de" in results_df.iloc[i, 3] or "_fr" in results_df.iloc[i, 3] or "_it" in results_df.iloc[i,3
                                                                        ] or "_zh" in results_df.iloc[i,3]:
            cla_new.append(results_df.iloc[i, 3][:-3])
        else:
            cla_new.append(results_df.iloc[i, 3])
            
        if "mixed" in results_df.iloc[i, 3]:
            lan.append("mixed language")
        elif "_fr" == results_df.iloc[i,3][-3:].lower():
            lan.append("french")
        elif "_it" == results_df.iloc[i,3][-3:].lower():
            lan.append("italian")
        elif "_zh" == results_df.iloc[i,3][-3:].lower():
            lan.append("chinese")
        elif "_de" == results_df.iloc[i,3][-3:].lower():
            lan.append("german")
        elif "_en" == results_df.iloc[i,3][-3:].lower():
            lan.append("english")
        else:
            lan.append("dessi data")
    results_df["Classes_new"] = cla_new
    results_df["Language"] = lan
    

    accuracies_lan = {}
    for i in results_df["Language"].unique():
        lan_df = results_df.loc[results_df["Language"] == i]
        accuracies_lan[i] = accuracy_score(lan_df["True Label"], lan_df["Prediction"])
    accuracies_cla = {}
    for i in results_df["Classes_new"].unique():
        cla_df = results_df.loc[results_df["Classes_new"] == i]
        accuracies_cla[i] = accuracy_score(cla_df["True Label"], cla_df["Prediction"])
    accuracies_dat = {}
    for i in results_df["Dataset"].unique():
        dat_df = results_df.loc[results_df["Dataset"] == i]
        accuracies_dat[i] = accuracy_score(dat_df["True Label"], dat_df["Prediction"])
    accuracies_col = {}
    ind = []
    for i in range(results_df.shape[0]):
        if is_valid_word(results_df.iloc[i, 0]):
            ind.append(i)
    accuracies_col["valid_col_names"] = accuracy_score(results_df.iloc[ind, 2], results_df.iloc[ind, 1])
    accuracies_col["invalid_col_names"] = accuracy_score(results_df.iloc[[a for a in range(results_df.shape[0]) if a not in ind], 2], 
                                                        results_df.iloc[[a for a in range(results_df.shape[0]) if a not in ind], 1])

    accuracies_cla = dict(sorted(accuracies_cla.items(), key=lambda item: item[1], reverse=True))
    accuracies_lan = dict(sorted(accuracies_lan.items(), key=lambda item: item[1], reverse=True))
    accuracies_col = dict(sorted(accuracies_col.items(), key=lambda item: item[1], reverse=True))
    accuracies_dat = dict(sorted(accuracies_dat.items(), key=lambda item: item[1], reverse=True))


    fig = make_subplots(rows=3, cols=1, subplot_titles=["Accuracy per Class", "Number of correct predictions per Class", "Number of false predictions per Class"],
                        shared_xaxes=True)
    fig.add_trace(go.Bar(
        x=list(accuracies_cla.keys()),
        y=list(accuracies_cla.values()),
        marker=dict(color="black"),
        showlegend=False
    ))
    
    colors = [
    "#d62728",  # Red (High Saturation)
    "#ff9896",  # Red (Low Saturation)
    "#1f77b4",  # Blue (High Saturation)
    "#aec7e8",  # Blue (Low Saturation)
    "#ff7f0e",  # Orange (High Saturation)
    "#ffbb78",  # Orange (Low Saturation)
    "#2ca02c",  # Green (High Saturation)
    "#98df8a",  # Green (Low Saturation)
    "#9467bd",  # Purple (High Saturation)
    "#c5b0d5",  # Purple (Low Saturation)
    ]

    languages = ['english', 'french', 'german', 'mixed language', 'dessi data'] if dataset != "two languages" else ["italian", "chinese"]
    for bool_val in [True, False]:
        for e, language in enumerate(languages):
            lan_df = results_df.loc[results_df["Language"] == language]
            accuracies_cla = {}
            for ee, col_valid in enumerate([True, False]):
                percentage_of_this_language = []
                for i in lan_df["Classes_new"].unique():
                    cla_df = lan_df.loc[lan_df["Classes_new"] == i]
                    ind = []
                    for j in range(cla_df.shape[0]):
                        if is_valid_word(cla_df.iloc[j, 0]) == col_valid:
                            ind.append(j)
                    cla_df = cla_df.iloc[ind,:]
                    percentage_of_this_language.append(cla_df.shape[0] / results_df.loc[results_df["Classes_new"] == i].shape[0])
                    accuracies_cla[i] = (cla_df["True Label"] == cla_df["Prediction"]).value_counts(normalize=True).get(bool_val, 0)
                fig.add_trace(go.Bar(
                    x=list(accuracies_cla.keys()),
                    y=[a * b for a, b in zip(list(accuracies_cla.values()), percentage_of_this_language)],
                    marker=dict(color=colors[2*e+ee]),
                    name=f"{language}_" + ("valid" if col_valid == True else "invalid") + "<br>column name",
                    showlegend=True if bool_val == True else False,
                    legendgroup=2*e+ee
                ), row=2 if bool_val == True else 3, col=1)
    fig.update_layout(title=f"Accuracy per Class for CASSED's personal Predictions on {dataset}", width=1500, height=700, barmode="stack")
    fig.update_yaxes(title_text="Accuracy", row=1, col=1)
    fig.update_yaxes(title_text="Amount of<br>correct predictions", row=2, col=1)
    fig.update_yaxes(title_text="Amount of<br>false predictions", row=3, col=1)
    fig.show()

    fig = make_subplots(
        rows=1, cols=3, shared_yaxes=True,
        subplot_titles=["Accuracy per Language", "Accuracy per Dataset", "Accuracy per Column Name"]
    )
    fig.add_trace(go.Bar(
        x=list(accuracies_lan.keys()),
        y=list(accuracies_lan.values()),
        showlegend=False
    ), row=1, col=1)
    fig.add_trace(go.Bar(
        x=list(accuracies_dat.keys()),
        y=list(accuracies_dat.values()),
        showlegend=False
    ), row=1, col=2)
    fig.add_trace(go.Bar(
        x=list(accuracies_col.keys()),
        y=list(accuracies_col.values()),
        showlegend=False
    ), row=1, col=3)
    fig.update_layout(width=900, height=500, title=f"Accuracy of CASSED's personal Predictions on {dataset} with respect to different categories")
    fig.show()

In [15]:
create_analysis_plot(results_df_own_data)

# Test two languages

In [16]:
test_lan = pd.read_csv("../../datasets/test_languages/test.csv")
test_lan_personal = pd.read_csv("../../datasets/test_languages/test_labels_personal.csv")
test_lan_classes = pd.read_csv("../../datasets/test_languages/test_classes.csv")
test_lan_datasets = pd.read_csv("../../datasets/test_languages/test_dataset.csv")

In [17]:
two_lan_true = pd.read_csv("../CASSED_model_results/cassed_personal/two_languages/true.csv")
two_lan_pred = pd.read_csv("../CASSED_model_results/cassed_personal/two_languages/predicted.csv")
two_lan_target_names = pd.read_csv("../CASSED_model_results/cassed_personal/two_languages/target_names.csv")
two_lan_target_names = {a[1]: a[0] for a in two_lan_target_names[["0", "0.1"]].values}
two_lan_target_names[0] = "no prediction"

In [18]:
predictions, true = [], []
for i in range(two_lan_true.shape[0]):
    predictions.append([two_lan_target_names[e] for e, b in enumerate(two_lan_pred.iloc[i,:]) if b == 1])
    true.append([two_lan_target_names[e] for e, b in enumerate(two_lan_true.iloc[i,:]) if b == 1])
true = [a[0] for a in true]
predictions = [a[0] for a in predictions]

In [19]:
plot_cassed_results(true, predictions)

In [20]:
results_df_test_lan = pd.DataFrame({
    "Column": test_lan.columns,
    "Prediction": predictions,
    "True Label": test_lan_personal["label"].values,
    "Classes": test_lan_classes["class"],
    "Dataset": test_lan_datasets["dataset"]
})

In [21]:
#False negatives
fn_df = results_df_test_lan.loc[(results_df_test_lan["True Label"] == "personal") & (results_df_test_lan["Prediction"] == "non-personal")]
fn_df

Unnamed: 0,Column,Prediction,True Label,Classes,Dataset
15,jxWkI9X7H6a,non-personal,personal,occupation_it,mimesis
21,title_it_1,non-personal,personal,title_it,mimesis
52,9n7IWUSmTtzQPxC5HC,non-personal,personal,title_it,mimesis
87,ssn_it_1,non-personal,personal,ssn_it,faker


In [22]:
#False positives
results_df_test_lan.loc[(results_df_test_lan["True Label"] == "non-personal") & (results_df_test_lan["Prediction"] == "personal")]

Unnamed: 0,Column,Prediction,True Label,Classes,Dataset
6,zh84KpLMcNfA,personal,non-personal,color_it,mimesis
10,7qnQTupqziQPtDu,personal,non-personal,color_zh,mimesis
11,word_zh_1,personal,non-personal,word_zh,mimesis
14,drink_zh_1,personal,non-personal,drink_zh,mimesis
24,color_it_1,personal,non-personal,color_it,mimesis
25,dish_zh_1,personal,non-personal,dish_zh,mimesis
28,answer_zh_1,personal,non-personal,answer_zh,mimesis
32,7Qg84iqh4gVJjr,personal,non-personal,city_zh,mimesis
34,d9Gf2leMeR,personal,non-personal,answer_zh,mimesis
38,dPI1af7W,personal,non-personal,dish_zh,mimesis


In [23]:
create_analysis_plot(results_df_test_lan, dataset="two languages")

# Kaggle Datasets

In [24]:
test_kaggle = pd.read_csv("../../datasets/kaggle_datasets/all_datasets.csv")
test_kaggle_personal = pd.read_csv("../../datasets/kaggle_datasets/all_datasets_labels_personal.csv")

In [25]:
kaggle_true = pd.read_csv("../CASSED_model_results/cassed_personal/kaggle/true.csv")
kaggle_pred = pd.read_csv("../CASSED_model_results/cassed_personal/kaggle/predicted.csv")
kaggle_target_names = pd.read_csv("../CASSED_model_results/cassed_personal/kaggle/target_names.csv")
kaggle_target_names = {a[1]: a[0] for a in kaggle_target_names[["0", "0.1"]].values}
kaggle_target_names[0] = "no prediction"

In [26]:
predictions, true = [], []
for i in range(kaggle_true.shape[0]):
    predictions.append([kaggle_target_names[e] for e, b in enumerate(kaggle_pred.iloc[i,:]) if b == 1])
    true.append([kaggle_target_names[e] for e, b in enumerate(kaggle_true.iloc[i,:]) if b == 1])
true = [a[0] for a in true]
predictions = [a[0] for a in predictions]

In [27]:
plot_cassed_results(true, predictions)

In [28]:
folders = [name for name in os.listdir("../../datasets/kaggle_datasets") if os.path.isdir(os.path.join("../../datasets/kaggle_datasets", name))]
dataset_kaggle, dataset_kaggle_info = [], []
dataset_type = ["pii info", "personal info", "non-personal info", "personal info", "personal info", "pii info", "pii info", "non-personal info",
                "non-personal info", "personal info", "pii info", "non-personal info", "personal info", "pii info", "non-personal info"]
for e, folder in enumerate(folders):
    path = "../../datasets/kaggle_datasets/" + folder
    csv_file = [f for f in os.listdir(path) if f.endswith('.csv') and 'labels' not in f][0]
    with open(path + "/" + csv_file, 'r') as file:
        first_line = file.readline()
        comma_count = first_line.count(',')
        semicolon_count = first_line.count(';')
        if comma_count > semicolon_count:
            sep = ","
        else:
            sep = ";"
    df = pd.read_csv(path + "/" + csv_file, sep=sep)
    dataset_kaggle += [folder] * df.shape[1]
    dataset_kaggle_info += [dataset_type[e]] * df.shape[1]


Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.



In [29]:
results_df_test_kaggle = pd.DataFrame({
    "Column": test_kaggle.columns,
    "Prediction": predictions,
    "True Label": test_kaggle_personal["label"].values,
    "Dataset": dataset_kaggle,
    "Dataset Type": dataset_kaggle_info
})

In [30]:
#False negatives
fn_df = results_df_test_kaggle.loc[(results_df_test_kaggle["True Label"] == "personal") & (results_df_test_kaggle["Prediction"] == "non-personal")]
fn_df

Unnamed: 0,Column,Prediction,True Label,Dataset,Dataset Type
5,Transportation expense,non-personal,personal,bank_marketing,pii info
6,Distance from Residence to Work,non-personal,personal,bank_marketing,pii info
7,Service time,non-personal,personal,bank_marketing,pii info
8,Age,non-personal,personal,bank_marketing,pii info
9,Work load Average/day,non-personal,personal,bank_marketing,pii info
10,Hit target,non-personal,personal,bank_marketing,pii info
17,Weight,non-personal,personal,bank_marketing,pii info
18,Height,non-personal,personal,bank_marketing,pii info
19,Body mass index,non-personal,personal,bank_marketing,pii info
21,age,non-personal,personal,adult_census,personal info


In [31]:
fn_df["Dataset"].value_counts()

Dataset
house_price            10
bank_marketing          9
adult_census            8
indian_companies        6
heart_disease           6
pixar                   6
student_performance     5
used_car                3
diabetes                2
titanic                 1
Name: count, dtype: int64

In [32]:
#False positives
results_df_test_kaggle.loc[(results_df_test_kaggle["True Label"] == "non-personal") & (results_df_test_kaggle["Prediction"] == "personal")]

Unnamed: 0,Column,Prediction,True Label,Dataset,Dataset Type
37,Location,personal,non-personal,diabetes,non-personal info
47,Season,personal,non-personal,diabetes,non-personal info
63,emp.var.rate,personal,non-personal,indian_companies,personal info
89,University Rating,personal,non-personal,used_car,pii info
113,country,personal,non-personal,house_price,pii info
114,outcode,personal,non-personal,house_price,pii info
117,bathrooms,personal,non-personal,house_price,pii info
118,bedrooms,personal,non-personal,house_price,pii info
120,livingRooms,personal,non-personal,absenteeism,non-personal info
121,tenure,personal,non-personal,absenteeism,non-personal info


# OpenML

In [33]:
test_openml = pd.read_csv("../../datasets/openml_datasets/all_datasets.csv")
test_openml_personal = pd.read_csv("../../datasets/openml_datasets/all_datasets_labels_personal.csv")

In [34]:
openml_true = pd.read_csv("../CASSED_model_results/cassed_personal/openml/true.csv")
openml_pred = pd.read_csv("../CASSED_model_results/cassed_personal/openml/predicted.csv")
openml_target_names = pd.read_csv("../CASSED_model_results/cassed_personal/openml/target_names.csv")
openml_target_names = {a[1]: a[0] for a in openml_target_names[["0", "0.1"]].values}
openml_target_names[0] = "no prediction"

In [35]:
predictions, true = [], []
for i in range(openml_true.shape[0]):
    predictions.append([openml_target_names[e] for e, b in enumerate(openml_pred.iloc[i,:]) if b == 1])
    true.append([openml_target_names[e] for e, b in enumerate(openml_true.iloc[i,:]) if b == 1])
true = [a[0] for a in true]
predictions = [a[0] for a in predictions]

In [36]:
plot_cassed_results(true, predictions)

In [37]:
folders = [name for name in os.listdir("../../datasets/openml_datasets") if os.path.isdir(os.path.join("../../datasets/openml_datasets", name))]
dataset_openml, dataset_openml_info = [], []
dataset_type = ["pii info", "personal info", "non-personal info", "personal info", "personal info", "pii info", "pii info", "non-personal info",
                "non-personal info", "personal info", "pii info", "non-personal info", "personal info", "pii info", "non-personal info"]
for e, folder in enumerate(folders):
    path = "../../datasets/openml_datasets/" + folder
    csv_file = [f for f in os.listdir(path) if f.endswith('.csv') and 'labels' not in f][0]
    with open(path + "/" + csv_file, 'r') as file:
        first_line = file.readline()
        comma_count = first_line.count(',')
        semicolon_count = first_line.count(';')
        if comma_count > semicolon_count:
            sep = ","
        else:
            sep = ";"
    df = pd.read_csv(path + "/" + csv_file, sep=sep)
    dataset_openml += [folder] * df.shape[1]
    dataset_openml_info += [dataset_type[e]] * df.shape[1]


Columns (9,12,13,14,15) have mixed types. Specify dtype option on import or set low_memory=False.



In [38]:
results_df_test_openml = pd.DataFrame({
    "Column": test_openml.columns,
    "Prediction": predictions,
    "True Label": test_openml_personal["label"].values,
    "Dataset": dataset_openml,
    "Dataset Type": dataset_openml_info
})

In [39]:
#False negatives
results_df_test_openml.loc[(results_df_test_openml["True Label"] == "personal") & (results_df_test_openml["Prediction"] == "non-personal")]

Unnamed: 0,Column,Prediction,True Label,Dataset,Dataset Type
0,User ID,non-personal,personal,FitBit_HeartRate,pii info
4,Date of Birth,non-personal,personal,CSM,personal info
9,Subscription Plan,non-personal,personal,CSM,personal info
10,Payment Information,non-personal,personal,CSM,personal info
11,Renewal Status,non-personal,personal,CSM,personal info
12,Usage Frequency,non-personal,personal,CSM,personal info
13,Purchase History,non-personal,personal,CSM,personal info
14,Favorite Genres,non-personal,personal,CSM,personal info
15,Devices Used,non-personal,personal,CSM,personal info
16,Engagement Metrics,non-personal,personal,HousingPrices,non-personal info


In [40]:
#False positives
results_df_test_openml.loc[(results_df_test_openml["True Label"] == "non-personal") & (results_df_test_openml["Prediction"] == "personal")]

Unnamed: 0,Column,Prediction,True Label,Dataset,Dataset Type
21,is_home,personal,non-personal,HousingPrices,non-personal info
22,target,personal,non-personal,HousingPrices,non-personal info
24,Month,personal,non-personal,APL_20_24,personal info
25,Day,personal,non-personal,APL_20_24,personal info
27,Referee,personal,non-personal,APL_20_24,personal info
28,team_playing_ft_goals_rolling_mean,personal,non-personal,APL_20_24,personal info
29,team_playing_ht_goals_rolling_mean,personal,non-personal,APL_20_24,personal info
31,team_playing_shots_on_target_rolling_mean,personal,non-personal,APL_20_24,personal info
33,team_playing_corners_won_rolling_mean,personal,non-personal,APL_20_24,personal info
34,team_playing_yellow_cards_rolling_mean,personal,non-personal,APL_20_24,personal info


# OpenML 2

In [41]:
test_openml_2 = pd.read_csv("../../datasets/openml_datasets_2/all_datasets.csv")
test_openml_2_personal = pd.read_csv("../../datasets/openml_datasets_2/all_datasets_labels_personal.csv")

In [42]:
openml_2_true = pd.read_csv("../CASSED_model_results/cassed_personal/openml_2/true.csv")
openml_2_pred = pd.read_csv("../CASSED_model_results/cassed_personal/openml_2/predicted.csv")
openml_2_target_names = pd.read_csv("../CASSED_model_results/cassed_personal/openml_2/target_names.csv")
openml_2_target_names = {a[1]: a[0] for a in openml_2_target_names[["0", "0.1"]].values}
openml_2_target_names[0] = "no prediction"

In [43]:
predictions, true = [], []
for i in range(openml_2_true.shape[0]):
    predictions.append([openml_2_target_names[e] for e, b in enumerate(openml_2_pred.iloc[i,:]) if b == 1])
    true.append([openml_2_target_names[e] for e, b in enumerate(openml_2_true.iloc[i,:]) if b == 1])
true = [a[0] for a in true]
predictions = [a[0] for a in predictions]

In [44]:
plot_cassed_results(true, predictions)

In [45]:
folders = [name for name in os.listdir("../../datasets/openml_datasets_2") if os.path.isdir(os.path.join("../../datasets/openml_datasets_2", name))]
dataset_openml_2, dataset_openml_2_info = [], []
dataset_type = ["pii info", "personal info", "non-personal info", "personal info", "personal info", "pii info", "pii info", "non-personal info",
                "non-personal info", "personal info", "pii info", "non-personal info", "personal info", "pii info", "non-personal info"]
for e, folder in enumerate(folders):
    path = "../../datasets/openml_datasets_2/" + folder
    csv_file = [f for f in os.listdir(path) if f.endswith('.csv') and 'labels' not in f][0]
    with open(path + "/" + csv_file, 'r') as file:
        first_line = file.readline()
        comma_count = first_line.count(',')
        semicolon_count = first_line.count(';')
        if comma_count > semicolon_count:
            sep = ","
        else:
            sep = ";"
    df = pd.read_csv(path + "/" + csv_file, sep=sep)
    dataset_openml_2 += [folder] * df.shape[1]
    dataset_openml_2_info += [dataset_type[e]] * df.shape[1]

In [46]:
results_df_test_openml_2 = pd.DataFrame({
    "Column": test_openml_2.columns,
    "Prediction": predictions,
    "True Label": test_openml_2_personal["label"].values,
    "Dataset": dataset_openml_2,
    "Dataset Type": dataset_openml_2_info
})

In [47]:
#False negatives
results_df_test_openml_2.loc[(results_df_test_openml_2["True Label"] == "personal") & (results_df_test_openml_2["Prediction"] == "non-personal")]

Unnamed: 0,Column,Prediction,True Label,Dataset,Dataset Type
15,age,non-personal,personal,Avocado-Prices-(Augmented),non-personal info
17,fractional,non-personal,personal,Avocado-Prices-(Augmented),non-personal info
18,epss,non-personal,personal,Avocado-Prices-(Augmented),non-personal info
19,lvdd,non-personal,personal,Avocado-Prices-(Augmented),non-personal info
20,wall_score,non-personal,personal,Avocado-Prices-(Augmented),non-personal info
21,wall_index,non-personal,personal,Avocado-Prices-(Augmented),non-personal info
23,class,non-personal,personal,Avocado-Prices-(Augmented),non-personal info


In [48]:
#False positives
results_df_test_openml_2.loc[(results_df_test_openml_2["True Label"] == "non-personal") & (results_df_test_openml_2["Prediction"] == "personal")]

Unnamed: 0,Column,Prediction,True Label,Dataset,Dataset Type
24,Species,personal,non-personal,Avocado-Prices-(Augmented),non-personal info
30,Sex,personal,non-personal,wine_quality,personal info
31,binaryClass,personal,non-personal,wine_quality,personal info
32,X,personal,non-personal,wine_quality,personal info
33,Y,personal,non-personal,wine_quality,personal info
43,rain,personal,non-personal,nyc-taxi-green-dec-2016,personal info
44,area,personal,non-personal,nyc-taxi-green-dec-2016,personal info
50,Bid_Volume,personal,non-personal,FOREX_chfjpy-minute-Close,pii info
55,Ask_Volume,personal,non-personal,FOREX_chfjpy-minute-Close,pii info
56,Class,personal,non-personal,FOREX_chfjpy-minute-Close,pii info


# Medical

In [49]:
test_medical = pd.read_csv("../../datasets/freiburg-medical/test.csv")
test_medical_personal = pd.read_csv("../../datasets/freiburg-medical/test_labels_personal.csv")

In [50]:
medical_true = pd.read_csv("../CASSED_model_results/cassed_personal/medical/true.csv")
medical_pred = pd.read_csv("../CASSED_model_results/cassed_personal/medical/predicted.csv")
medical_target_names = pd.read_csv("../CASSED_model_results/cassed_personal/medical/target_names.csv")
medical_target_names = {a[1]: a[0] for a in medical_target_names[["0", "0.1"]].values}
medical_target_names[0] = "no prediction"

In [51]:
predictions, true = [], []
for i in range(medical_true.shape[0]):
    predictions.append([medical_target_names[e] for e, b in enumerate(medical_pred.iloc[i,:]) if b == 1])
    true.append([medical_target_names[e] for e, b in enumerate(medical_true.iloc[i,:]) if b == 1])
true = [a[0] for a in true]
predictions = [a[0] for a in predictions]

In [52]:
y_true = true
y_pred = predictions
y_true_bin = [1 if label == "personal" else 0 for label in y_true]
y_pred_bin = [1 if label == "personal" else 0 for label in y_pred]

cm = confusion_matrix(y_true, y_pred, labels=["personal", "non-personal"])
cm_norm = confusion_matrix(y_true, y_pred, labels=["personal", "non-personal"], normalize="true")

accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average="weighted")
precision = precision_score(y_true, y_pred, average="weighted", zero_division=0)
recall = recall_score(y_true, y_pred, average="weighted", zero_division=0)

fig = make_subplots(
    rows=1, cols=2, 
    subplot_titles=["Evaluation Metrics", "Confusion Matrix"]
)

metrics = {
    "weighted Precision": precision,
    "weighted Recall": recall,
    "weighted F1 Score": f1,
    "Accuracy": accuracy
}

fig.add_trace(go.Bar(
    x=list(metrics.keys()),
    y=list(metrics.values()), 
    showlegend=False
), row=1, col=1)

text_values = [
    [f"{cm[0,0]}<br>{cm_norm[0,0]:.2f}", f"{cm[0,1]}<br>{cm_norm[0,1]:.2f}"],
    [f"{cm[1,0]}<br>{cm_norm[1,0]:.2f}", f"{cm[1,1]}<br>{cm_norm[1,1]:.2f}"]
]
cm_heatmap = go.Heatmap(
    z=cm, 
    x=["Pred: personal", "Pred: non-personal"], 
    y=["True: personal", "True: non-personal"], 
    colorscale="Blues", 
    showscale=False,
    text=text_values, 
    texttemplate="%{text}",
    textfont={"size":20},
)


fig.add_trace(cm_heatmap, row=1, col=2)

fig.update_layout(
    title="Confusion Matrix and Evaluation Metrics",
    height=500, 
    width=1000,
    showlegend=False
)
fig.update_yaxes(range=[0, 1.1], row=1, col=1)

fig.show()

In [53]:
results_df_test_medical = pd.DataFrame({
    "Column": test_medical.columns,
    "Prediction": predictions,
    "True Label": test_medical_personal["label"].values
})

In [54]:
#False negatives
results_df_test_medical.loc[(results_df_test_medical["True Label"] == "personal") & (results_df_test_medical["Prediction"] == "non-personal")]

Unnamed: 0,Column,Prediction,True Label
2,admission_type,non-personal,personal
5,birth_date,non-personal,personal
14,first_operation_code,non-personal,personal
20,any_grouped_medication,non-personal,personal
22,age_at_visit,non-personal,personal
23,total_number_operations,non-personal,personal
24,total_opeartion_duration,non-personal,personal
26,major_surgery_duration,non-personal,personal


In [55]:
#False positives
results_df_test_medical.loc[(results_df_test_medical["True Label"] == "non-personal") & (results_df_test_medical["Prediction"] == "personal")]

Unnamed: 0,Column,Prediction,True Label
