In [1]:
import pandas as pd
import plotly.graph_objects as go
import re
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, roc_auc_score, recall_score
from plotly.subplots import make_subplots
import os

In [2]:
# Load the dessi-mf test split and its companion files (the file names suggest PII labels,
# class labels and source-dataset names — TODO confirm contents).
# Later cells in this notebook read own_data_classes["class"] and own_data_datasets["dataset"].
own_data = pd.read_csv("../../datasets/dessi-mf/dessi-mf/test.csv")
own_data_pii = pd.read_csv("../../datasets/dessi-mf/dessi-mf/test_labels_pii.csv")
own_data_classes = pd.read_csv("../../datasets/dessi-mf/dessi-mf/test_classes.csv")
own_data_datasets = pd.read_csv("../../datasets/dessi-mf/dessi-mf/test_dataset.csv")

In [3]:
def extract_prediction(i, predictions):
    """Return the label names found in the first cell of row ``i``.

    A label is a run of ASCII letters, digits or hyphens that is followed
    (optionally after whitespace) by a parenthesised number such as
    ``(1)`` or ``(0.98)``. Only the label text is returned; the number is
    discarded.

    Parameters
    ----------
    i : int
        Positional row index into ``predictions``.
    predictions : pandas.DataFrame
        Frame whose first column holds the text to scan.

    Returns
    -------
    list of str
        The labels in the order they appear; empty if none match.
    """
    cell_text = predictions.iloc[i, :].values[0]
    label_with_score = r"([a-zA-Z0-9\-]+)\s*\(\d+(\.\d+)?\)"
    return [found.group(1) for found in re.finditer(label_with_score, cell_text)]


def convert_predictions(predictions):
    """Reshape a raw CASSED result dump into one row per evaluated column.

    Only the first column of ``predictions`` is read. Its cells are
    classified by their leading text:

    * ``Gold:`` within the first 10 characters -> gold label line,
    * ``Pred:`` within the first 10 characters -> prediction line,
    * ``MISMATCH`` within the first 20 characters -> mismatch marker
      (only used to flag the prediction line directly before it),
    * anything else -> a new column name (text before the first ``". "``).

    The header of ``predictions`` supplies the first column name, because
    the file's first line was consumed as the header when it was read.

    Parameters
    ----------
    predictions : pandas.DataFrame
        Raw result file as read by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name``, ``Gold``, ``Pred`` and ``Mismatch`` ("Mismatch" /
        "No Mismatch"). Construction raises ``ValueError`` if the four
        collected lists end up with different lengths.
    """
    first_cells = [predictions.iloc[row, :].values[0] for row in range(predictions.shape[0])]
    last_row = len(first_cells) - 1

    names = [predictions.columns[0].split(". ")[0].strip()]
    golds, preds, flags = [], [], []

    for row, cell in enumerate(first_cells):
        head = cell[:10]
        if 'Gold:' in head:
            # str(list)[2:-2] drops the leading "['" and trailing "']" of the list repr,
            # leaving the bare label when exactly one label was extracted.
            golds.append(str(extract_prediction(row, predictions))[2:-2])
            continue
        if 'Pred:' in head:
            preds.append(str(extract_prediction(row, predictions))[2:-2])
            # A prediction is a mismatch only if the very next line carries the marker;
            # the final line has no successor and therefore counts as a match.
            flagged = row != last_row and 'MISMATCH' in first_cells[row + 1][:20]
            flags.append("Mismatch" if flagged else "No Mismatch")
            continue
        if 'MISMATCH' not in cell[:20]:
            names.append(cell.split('. ')[0].strip())

    return pd.DataFrame({
        'Name': names,
        'Gold': golds,
        'Pred': preds,
        'Mismatch': flags
    })

In [4]:
# Read CASSED's raw result dump for dessi-mf (tab-separated) and reshape it into one row per
# column with Name / Gold / Pred / Mismatch fields. The name is reused, so after this cell
# results_own_data refers to the converted frame, not the raw file.
results_own_data = pd.read_csv("../CASSED_model_results/cassed_pii/dessi-mf/test.tsv", sep='\t')
results_own_data = convert_predictions(results_own_data)

In [5]:
def plot_cassed_results(y_true, y_pred):
    """Show summary metrics and a confusion matrix for pii / non-pii predictions.

    The figure has two panels: a bar chart with weighted precision, weighted
    recall, weighted F1, accuracy and AUC-ROC, and a 2x2 heatmap of the
    confusion matrix annotated with raw counts and row-normalised shares.

    Parameters
    ----------
    y_true, y_pred : sequence of str
        True and predicted labels. The confusion matrix only counts the
        labels "pii" and "non-pii"; the other metrics use every label present.

    Notes
    -----
    AUC-ROC is computed from hard 0/1 decisions ("pii" -> 1, any other
    label -> 0), not from scores. The figure is displayed via ``fig.show()``;
    nothing is returned.
    """
    label_order = ["pii", "non-pii"]

    # Binary encoding used only for the AUC-ROC bar.
    truth_binary = [int(label == "pii") for label in y_true]
    guess_binary = [int(label == "pii") for label in y_pred]

    counts = confusion_matrix(y_true, y_pred, labels=label_order)
    row_shares = confusion_matrix(y_true, y_pred, labels=label_order, normalize="true")

    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average="weighted")
    precision = precision_score(y_true, y_pred, average="weighted", zero_division=0)
    recall = recall_score(y_true, y_pred, average="weighted")
    auc_roc = roc_auc_score(truth_binary, guess_binary)

    # Insertion order fixes the left-to-right order of the bars.
    metrics = {}
    metrics["weighted Precision"] = precision
    metrics["weighted Recall"] = recall
    metrics["weighted F1 Score"] = f1
    metrics["Accuracy"] = accuracy
    metrics["AUC-ROC"] = auc_roc

    fig = make_subplots(
        rows=1,
        cols=2,
        subplot_titles=["Evaluation Metrics", "Confusion Matrix"],
    )

    metric_bars = go.Bar(
        x=list(metrics.keys()),
        y=list(metrics.values()),
        showlegend=False,
    )
    fig.add_trace(metric_bars, row=1, col=1)

    # Each heatmap cell shows the raw count above the row-normalised share.
    cell_text = [
        [f"{counts[r, c]}<br>{row_shares[r, c]:.2f}" for c in range(2)]
        for r in range(2)
    ]
    fig.add_trace(
        go.Heatmap(
            z=counts,
            x=["Pred: pii", "Pred: non-pii"],
            y=["True: pii", "True: non-pii"],
            colorscale="Blues",
            showscale=False,
            text=cell_text,
            texttemplate="%{text}",
            textfont={"size": 20},
        ),
        row=1,
        col=2,
    )

    fig.update_layout(
        title="Confusion Matrix and Evaluation Metrics",
        height=500,
        width=1000,
        showlegend=False,
    )
    # Leave a little headroom above 1.0 on the metrics panel.
    fig.update_yaxes(range=[0, 1.1], row=1, col=1)

    fig.show()
plot_cassed_results(results_own_data["Gold"], results_own_data["Pred"])

In [6]:
# Combine the converted CASSED results with the class and source dataset of each column.
# All inputs are pandas Series, so rows are matched on their index; this assumes the label
# files list the columns in the same order as the result file — TODO confirm.
# create_analysis_plot reads this frame by position (0=Column, 1=Prediction, 2=True Label,
# 3=Classes), so the key order here matters.
results_df_own_data = pd.DataFrame({
    "Column": results_own_data["Name"],
    "Prediction": results_own_data["Pred"],
    "True Label": results_own_data["Gold"],
    "Classes": own_data_classes["class"],
    "Dataset": own_data_datasets["dataset"]
})

In [7]:
# False negatives: columns whose true label is "pii" but which were predicted "non-pii".
results_df_own_data.loc[(results_df_own_data["True Label"] == "pii") & (results_df_own_data["Prediction"] == "non-pii")]

Unnamed: 0,Column,Prediction,True Label,Classes,Dataset


In [8]:
# False positives: columns whose true label is "non-pii" but which were predicted "pii".
results_df_own_data.loc[(results_df_own_data["True Label"] == "non-pii") & (results_df_own_data["Prediction"] == "pii")]

Unnamed: 0,Column,Prediction,True Label,Classes,Dataset


In [9]:
# Lower-cased text before the first underscore of every distinct class label in the
# dessi-mf results (kept as a list, in order of first appearance).
CLASS_NAMES = [label.lower().split("_")[0] for label in results_df_own_data["Classes"].unique()]


def is_valid_word(word):
    """Tell whether ``word`` starts with a known class name.

    ``word`` is lower-cased and cut at its first underscore; the result is
    looked up in ``CLASS_NAMES``, which is derived from the dessi-mf class
    labels regardless of which dataset ``word`` comes from.

    Returns
    -------
    bool
        True if the leading token is one of ``CLASS_NAMES``.
    """
    leading_token = word.lower().split("_")[0]
    return leading_token in CLASS_NAMES

# Language name for each recognised 3-character class-label suffix
# (the suffix is lower-cased before the lookup).
_LANGUAGE_BY_SUFFIX = {
    "_fr": "french",
    "_it": "italian",
    "_zh": "chinese",
    "_de": "german",
    "_en": "english",
}

# Suffixes removed from a class label to obtain its base class name (case-sensitive).
_STRIPPED_SUFFIXES = ("_en", "_de", "_fr", "_it", "_zh")


def _split_class_label(class_label):
    """Split a raw class label into ``(base class name, language name)``."""
    if "mixed" in class_label or "de_DE" in class_label or "fr_FR" in class_label:
        # These markers are cut off as a fixed 6-character tail
        # (presumably "_mixed", "_de_DE", "_fr_FR" — TODO confirm against the data).
        base_name = class_label[:-6]
    elif class_label.endswith(_STRIPPED_SUFFIXES):
        # Only a real trailing suffix is removed; a label that merely contains
        # e.g. "_en" somewhere in the middle keeps its full text.
        base_name = class_label[:-3]
    else:
        base_name = class_label

    if "mixed" in class_label:
        language = "mixed language"
    else:
        language = _LANGUAGE_BY_SUFFIX.get(class_label[-3:].lower(), "dessi data")
    return base_name, language


def _accuracy_by_group(results_df, column):
    """Return ``{group value: accuracy}`` for ``column``, sorted by accuracy, best first."""
    scores = {}
    for value in results_df[column].unique():
        subset = results_df.loc[results_df[column] == value]
        scores[value] = accuracy_score(subset["True Label"], subset["Prediction"])
    return dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))


def create_analysis_plot(results_df, dataset="dessi-mf"):
    """Plot CASSED's pii accuracy broken down by class, language, dataset and column name.

    Two figures are shown:

    1. Three stacked rows: accuracy per base class; the share of each class's
       columns that were predicted correctly; and the share predicted
       incorrectly. The last two are stacked bars split by language and by
       whether the column name is a valid class word.
    2. Accuracy per language, per source dataset, and for valid vs. invalid
       column names.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Must contain the columns "True Label", "Prediction" and "Dataset".
        It is additionally read by position: column 0 is the column name,
        1 the prediction, 2 the true label and 3 the raw class label.
    dataset : str, default "dessi-mf"
        Used in the figure titles. The value "two languages" switches the
        stacked bars to italian/chinese; any other value uses
        english/french/german/mixed language/dessi data.

    Notes
    -----
    ``results_df`` is modified in place: the columns "Classes_new" (base
    class name) and "Language" are added or overwritten. Nothing is returned;
    both figures are displayed via ``fig.show()``.
    """
    # Derive base class and language from the raw class label (4th column).
    split_labels = [_split_class_label(label) for label in results_df.iloc[:, 3]]
    results_df["Classes_new"] = [base for base, _ in split_labels]
    results_df["Language"] = [language for _, language in split_labels]

    accuracies_lan = _accuracy_by_group(results_df, "Language")
    accuracies_cla = _accuracy_by_group(results_df, "Classes_new")
    accuracies_dat = _accuracy_by_group(results_df, "Dataset")

    # Partition row positions once by whether the column name is a valid class word.
    valid_rows, invalid_rows = [], []
    for position, column_name in enumerate(results_df.iloc[:, 0]):
        (valid_rows if is_valid_word(column_name) else invalid_rows).append(position)
    accuracies_col = {
        "valid_col_names": accuracy_score(results_df.iloc[valid_rows, 2], results_df.iloc[valid_rows, 1]),
        "invalid_col_names": accuracy_score(results_df.iloc[invalid_rows, 2], results_df.iloc[invalid_rows, 1]),
    }
    accuracies_col = dict(sorted(accuracies_col.items(), key=lambda item: item[1], reverse=True))

    fig = make_subplots(rows=3, cols=1, subplot_titles=["Accuracy per Class", "Number of correct predictions per Class", "Number of false predictions per Class"],
                        shared_xaxes=True)
    # No row/col given: the trace lands on the first subplot.
    fig.add_trace(go.Bar(
        x=list(accuracies_cla.keys()),
        y=list(accuracies_cla.values()),
        marker=dict(color="black"),
        showlegend=False
    ))

    # One hue per language; the saturated shade is used for valid column names,
    # the pale shade for invalid ones.
    colors = [
    "#d62728",  # Red (High Saturation)
    "#ff9896",  # Red (Low Saturation)
    "#1f77b4",  # Blue (High Saturation)
    "#aec7e8",  # Blue (Low Saturation)
    "#ff7f0e",  # Orange (High Saturation)
    "#ffbb78",  # Orange (Low Saturation)
    "#2ca02c",  # Green (High Saturation)
    "#98df8a",  # Green (Low Saturation)
    "#9467bd",  # Purple (High Saturation)
    "#c5b0d5",  # Purple (Low Saturation)
    ]

    # Total number of columns per base class, used to scale every stacked segment.
    class_totals = results_df["Classes_new"].value_counts().to_dict()

    languages = ['english', 'french', 'german', 'mixed language', 'dessi data'] if dataset != "two languages" else ["italian", "chinese"]
    for bool_val in [True, False]:
        # Correct predictions go to row 2, wrong ones to row 3.
        target_row = 2 if bool_val else 3
        for e, language in enumerate(languages):
            lan_df = results_df.loc[results_df["Language"] == language]
            class_names = list(lan_df["Classes_new"].unique())
            for ee, col_valid in enumerate([True, False]):
                segment_heights = []
                for class_name in class_names:
                    cla_df = lan_df.loc[lan_df["Classes_new"] == class_name]
                    keep = [j for j, column_name in enumerate(cla_df.iloc[:, 0])
                            if is_valid_word(column_name) == col_valid]
                    cla_df = cla_df.iloc[keep, :]
                    # Share of this class's columns that fall into the current
                    # language / name-validity bucket ...
                    bucket_share = cla_df.shape[0] / class_totals[class_name]
                    # ... times the fraction of that bucket with the requested outcome
                    # (0 when the bucket is empty or the outcome never occurs).
                    outcome_rate = (cla_df["True Label"] == cla_df["Prediction"]).value_counts(normalize=True).get(bool_val, 0)
                    segment_heights.append(outcome_rate * bucket_share)
                fig.add_trace(go.Bar(
                    x=class_names,
                    y=segment_heights,
                    marker=dict(color=colors[2*e+ee]),
                    name=f"{language}_" + ("valid" if col_valid == True else "invalid") + "<br>column name",
                    # Show each legend entry once (on the "correct" row) and tie
                    # the matching bars of both rows together.
                    showlegend=True if bool_val == True else False,
                    legendgroup=2*e+ee
                ), row=target_row, col=1)
    fig.update_layout(title=f"Accuracy per Class for CASSED's pii Predictions on {dataset}", width=1500, height=700, barmode="stack")
    fig.update_yaxes(title_text="Accuracy", row=1, col=1)
    fig.update_yaxes(title_text="Amount of<br>correct predictions", row=2, col=1)
    fig.update_yaxes(title_text="Amount of<br>false predictions", row=3, col=1)
    fig.show()

    fig = make_subplots(
        rows=1, cols=3, shared_yaxes=True,
        subplot_titles=["Accuracy per Language", "Accuracy per Dataset", "Accuracy per Column Name"]
    )
    fig.add_trace(go.Bar(
        x=list(accuracies_lan.keys()),
        y=list(accuracies_lan.values()),
        showlegend=False
    ), row=1, col=1)
    fig.add_trace(go.Bar(
        x=list(accuracies_dat.keys()),
        y=list(accuracies_dat.values()),
        showlegend=False
    ), row=1, col=2)
    fig.add_trace(go.Bar(
        x=list(accuracies_col.keys()),
        y=list(accuracies_col.values()),
        showlegend=False
    ), row=1, col=3)
    fig.update_layout(width=900, height=500, title=f"Accuracy of CASSED's pii Predictions on {dataset} with respect to different categories")
    fig.show()

In [10]:
# Draw the per-class, per-language, per-dataset and per-column-name accuracy charts for dessi-mf.
# Note: the call adds the columns "Classes_new" and "Language" to results_df_own_data.
create_analysis_plot(results_df_own_data)

# Test two languages

In [11]:
# Load the two-language test set and its companion files (the file names suggest PII labels,
# class labels and source-dataset names — TODO confirm contents).
# Later cells read test_lan.columns, test_lan_pii["label"], test_lan_classes["class"]
# and test_lan_datasets["dataset"].
test_lan = pd.read_csv("../../datasets/test_languages/test.csv")
test_lan_pii = pd.read_csv("../../datasets/test_languages/test_labels_pii.csv")
test_lan_classes = pd.read_csv("../../datasets/test_languages/test_classes.csv")
test_lan_datasets = pd.read_csv("../../datasets/test_languages/test_dataset.csv")

In [12]:
# CASSED outputs for the two-language test set: indicator matrices for the true and the
# predicted labels, plus the table that names the indicator columns.
two_lan_true = pd.read_csv("../CASSED_model_results/cassed_pii/two_languages/true.csv")
two_lan_pred = pd.read_csv("../CASSED_model_results/cassed_pii/two_languages/predicted.csv")
two_lan_target_names = pd.read_csv("../CASSED_model_results/cassed_pii/two_languages/target_names.csv")
# Turn the table into a lookup: value of column "0.1" -> value of column "0"
# (presumably column position -> label name — TODO confirm).
two_lan_target_names = {a[1]: a[0] for a in two_lan_target_names[["0", "0.1"]].values}
# Position 0 is given the placeholder label (this overwrites any existing entry for key 0).
two_lan_target_names[0] = "no prediction"

In [13]:
# Decode the indicator matrices into label names: for every row, collect the names of all
# column positions whose value equals 1.
# NOTE(review): position 0 decodes to "no prediction"; if the first CSV column is a saved row
# index rather than an indicator, the row whose index is 1 would be decoded wrongly — confirm
# the file layout.
predictions, true = [], []
for i in range(two_lan_true.shape[0]):
    predictions.append([two_lan_target_names[e] for e, b in enumerate(two_lan_pred.iloc[i,:]) if b == 1])
    true.append([two_lan_target_names[e] for e, b in enumerate(two_lan_true.iloc[i,:]) if b == 1])
# Keep only the first decoded label per row. A row without any 1 yields an empty list and
# makes a[0] raise IndexError.
true = [a[0] for a in true]
predictions = [a[0] for a in predictions]

In [14]:
# Metrics and confusion matrix for the two-language test set. `true` and `predictions` are
# notebook-wide names that later sections reassign, so run this right after the decoding cell.
plot_cassed_results(true, predictions)

In [15]:
# One row per column of the two-language test set: its header, CASSED's prediction, the true
# label, the class and the source dataset. Assumes all five inputs have one entry per column,
# in the same order — TODO confirm.
# create_analysis_plot reads this frame by position (0=Column, 1=Prediction, 2=True Label,
# 3=Classes), so the key order here matters.
results_df_test_lan = pd.DataFrame({
    "Column": test_lan.columns,
    "Prediction": predictions,
    "True Label": test_lan_pii["label"].values,
    "Classes": test_lan_classes["class"],
    "Dataset": test_lan_datasets["dataset"]
})

In [16]:
# False negatives: columns whose true label is "pii" but which were predicted "non-pii".
# The result is kept in fn_df (a name the Kaggle section reassigns) and displayed.
fn_df = results_df_test_lan.loc[(results_df_test_lan["True Label"] == "pii") & (results_df_test_lan["Prediction"] == "non-pii")]
fn_df

Unnamed: 0,Column,Prediction,True Label,Classes,Dataset
50,QrS7RPeMOkI,non-pii,pii,full_name_zh,mimesis
74,full_name_zh_1,non-pii,pii,full_name_zh,mimesis
82,5wnRpQ,non-pii,pii,ssn_it,faker
87,ssn_it_1,non-pii,pii,ssn_it,faker


In [17]:
# False positives: columns whose true label is "non-pii" but which were predicted "pii".
results_df_test_lan.loc[(results_df_test_lan["True Label"] == "non-pii") & (results_df_test_lan["Prediction"] == "pii")]

Unnamed: 0,Column,Prediction,True Label,Classes,Dataset
31,YEZAmggQBwBA,pii,non-pii,company_it,mimesis


In [18]:
# Same breakdown charts for the two-language test set; dataset="two languages" switches the
# stacked bars to italian/chinese. The call adds "Classes_new" and "Language" to
# results_df_test_lan.
create_analysis_plot(results_df_test_lan, dataset="two languages")

# Kaggle Datasets

In [19]:
# Load the combined Kaggle columns and their labels file. Later cells read
# test_kaggle.columns and test_kaggle_pii["label"].
test_kaggle = pd.read_csv("../../datasets/kaggle_datasets/all_datasets.csv")
test_kaggle_pii = pd.read_csv("../../datasets/kaggle_datasets/all_datasets_labels_pii.csv")

In [20]:
# CASSED outputs for the Kaggle columns: indicator matrices for the true and the predicted
# labels, plus the table that names the indicator columns.
kaggle_true = pd.read_csv("../CASSED_model_results/cassed_pii/kaggle/true.csv")
kaggle_pred = pd.read_csv("../CASSED_model_results/cassed_pii/kaggle/predicted.csv")
kaggle_target_names = pd.read_csv("../CASSED_model_results/cassed_pii/kaggle/target_names.csv")
# Turn the table into a lookup: value of column "0.1" -> value of column "0"
# (presumably column position -> label name — TODO confirm).
kaggle_target_names = {a[1]: a[0] for a in kaggle_target_names[["0", "0.1"]].values}
# Position 0 is given the placeholder label (this overwrites any existing entry for key 0).
kaggle_target_names[0] = "no prediction"

In [21]:
# Decode the indicator matrices into label names: for every row, collect the names of all
# column positions whose value equals 1. This overwrites the notebook-wide `predictions` and
# `true` from the previous section.
# NOTE(review): position 0 decodes to "no prediction"; if the first CSV column is a saved row
# index rather than an indicator, the row whose index is 1 would be decoded wrongly — confirm
# the file layout.
predictions, true = [], []
for i in range(kaggle_true.shape[0]):
    predictions.append([kaggle_target_names[e] for e, b in enumerate(kaggle_pred.iloc[i,:]) if b == 1])
    true.append([kaggle_target_names[e] for e, b in enumerate(kaggle_true.iloc[i,:]) if b == 1])
# Keep only the first decoded label per row. A row without any 1 yields an empty list and
# makes a[0] raise IndexError.
true = [a[0] for a in true]
predictions = [a[0] for a in predictions]

In [22]:
# Metrics and confusion matrix for the Kaggle columns. `true` and `predictions` are
# notebook-wide names that other sections reassign, so run this right after the decoding cell.
plot_cassed_results(true, predictions)

In [23]:
# Attribute every Kaggle column to the folder (dataset) it comes from and tag it with that
# dataset's type, producing one list entry per column.
folders = [name for name in os.listdir("../../datasets/kaggle_datasets") if os.path.isdir(os.path.join("../../datasets/kaggle_datasets", name))]
dataset_kaggle, dataset_kaggle_info = [], []
# One type per folder, paired with `folders` by position.
# NOTE(review): os.listdir returns entries in arbitrary order, so this positional pairing — and
# the assumption that the folder order equals the column order of all_datasets.csv — is fragile;
# verify the resulting "Dataset" attribution against the actual column names.
dataset_type = ["pii info", "pii info", "non-pii info", "pii info", "pii info", "pii info", "pii info", "non-pii info",
                "non-pii info", "pii info", "pii info", "non-pii info", "pii info", "pii info", "non-pii info"]
for e, folder in enumerate(folders):
    path = "../../datasets/kaggle_datasets/" + folder
    # First CSV in the folder that is not a labels file.
    csv_file = [f for f in os.listdir(path) if f.endswith('.csv') and 'labels' not in f][0]
    # Guess the delimiter from the header line: "," if it is strictly more frequent than ";",
    # otherwise ";".
    with open(path + "/" + csv_file, 'r') as file:
        first_line = file.readline()
        comma_count = first_line.count(',')
        semicolon_count = first_line.count(';')
        if comma_count > semicolon_count:
            sep = ","
        else:
            sep = ";"
    # Only the number of columns is needed, so parse just the header (nrows=0) instead of the
    # whole file; this also avoids the mixed-dtype DtypeWarning a full parse can emit.
    df = pd.read_csv(path + "/" + csv_file, sep=sep, nrows=0)
    dataset_kaggle += [folder] * df.shape[1]
    dataset_kaggle_info += [dataset_type[e]] * df.shape[1]


Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.



In [24]:
# One row per Kaggle column: its header, CASSED's prediction, the true label, and the folder
# and dataset type assigned by the folder scan. Assumes all five inputs have one entry per
# column, in the same order — TODO confirm.
results_df_test_kaggle = pd.DataFrame({
    "Column": test_kaggle.columns,
    "Prediction": predictions,
    "True Label": test_kaggle_pii["label"].values,
    "Dataset": dataset_kaggle,
    "Dataset Type": dataset_kaggle_info
})

In [25]:
# False negatives: columns whose true label is "pii" but which were predicted "non-pii".
# The result is kept in fn_df (overwriting the value from the two-language section) and displayed.
fn_df = results_df_test_kaggle.loc[(results_df_test_kaggle["True Label"] == "pii") & (results_df_test_kaggle["Prediction"] == "non-pii")]
fn_df

Unnamed: 0,Column,Prediction,True Label,Dataset,Dataset Type
0,ID,non-pii,pii,bank_marketing,pii info
86,Serial No.,non-pii,pii,used_car,pii info
95,id,non-pii,pii,house_price,pii info
223,PassengerId,non-pii,pii,student_performance,pii info
233,Cabin,non-pii,pii,student_performance,pii info


In [26]:
# False positives: columns whose true label is "non-pii" but which were predicted "pii".
results_df_test_kaggle.loc[(results_df_test_kaggle["True Label"] == "non-pii") & (results_df_test_kaggle["Prediction"] == "pii")]

Unnamed: 0,Column,Prediction,True Label,Dataset,Dataset Type
115,latitude,pii,non-pii,house_price,pii info
139,CORPORATE_IDENTIFICATION_NUMBER,pii,non-pii,absenteeism,non-pii info
153,EMAIL_ADDR,pii,non-pii,titanic,pii info
171,body,pii,non-pii,pixar,pii info
179,director,pii,non-pii,agriculture,non-pii info
180,writer,pii,non-pii,agriculture,non-pii info
181,main_characters,pii,non-pii,agriculture,non-pii info
183,main_voice_actors,pii,non-pii,agriculture,non-pii info


# OpenML

In [27]:
# Load the combined OpenML columns and their labels file. Later cells read
# test_openml.columns and test_openml_pii["label"].
test_openml = pd.read_csv("../../datasets/openml_datasets/all_datasets.csv")
test_openml_pii = pd.read_csv("../../datasets/openml_datasets/all_datasets_labels_pii.csv")

In [28]:
# CASSED outputs for the OpenML columns: indicator matrices for the true and the predicted
# labels, plus the table that names the indicator columns.
openml_true = pd.read_csv("../CASSED_model_results/cassed_pii/openml/true.csv")
openml_pred = pd.read_csv("../CASSED_model_results/cassed_pii/openml/predicted.csv")
openml_target_names = pd.read_csv("../CASSED_model_results/cassed_pii/openml/target_names.csv")
# Turn the table into a lookup: value of column "0.1" -> value of column "0"
# (presumably column position -> label name — TODO confirm).
openml_target_names = {a[1]: a[0] for a in openml_target_names[["0", "0.1"]].values}
# Position 0 is given the placeholder label (this overwrites any existing entry for key 0).
openml_target_names[0] = "no prediction"

In [29]:
# Decode the indicator matrices into label names: for every row, collect the names of all
# column positions whose value equals 1. This overwrites the notebook-wide `predictions` and
# `true` from the previous section.
# NOTE(review): position 0 decodes to "no prediction"; if the first CSV column is a saved row
# index rather than an indicator, the row whose index is 1 would be decoded wrongly — confirm
# the file layout.
predictions, true = [], []
for i in range(openml_true.shape[0]):
    predictions.append([openml_target_names[e] for e, b in enumerate(openml_pred.iloc[i,:]) if b == 1])
    true.append([openml_target_names[e] for e, b in enumerate(openml_true.iloc[i,:]) if b == 1])
# Keep only the first decoded label per row. A row without any 1 yields an empty list and
# makes a[0] raise IndexError.
true = [a[0] for a in true]
predictions = [a[0] for a in predictions]

In [30]:
# Metrics and confusion matrix for the OpenML columns. `true` and `predictions` are
# notebook-wide names that other sections reassign, so run this right after the decoding cell.
plot_cassed_results(true, predictions)

In [31]:
# Attribute every OpenML column to the folder (dataset) it comes from and tag it with that
# dataset's type, producing one list entry per column.
folders = [name for name in os.listdir("../../datasets/openml_datasets") if os.path.isdir(os.path.join("../../datasets/openml_datasets", name))]
dataset_openml, dataset_openml_info = [], []
# One type per folder, paired with `folders` by position.
# NOTE(review): os.listdir returns entries in arbitrary order, so this positional pairing — and
# the assumption that the folder order equals the column order of all_datasets.csv — is fragile.
# The list is also identical to the one used for the Kaggle folders; verify it was adapted
# for the OpenML datasets.
dataset_type = ["pii info", "pii info", "non-pii info", "pii info", "pii info", "pii info", "pii info", "non-pii info",
                "non-pii info", "pii info", "pii info", "non-pii info", "pii info", "pii info", "non-pii info"]
for e, folder in enumerate(folders):
    path = "../../datasets/openml_datasets/" + folder
    # First CSV in the folder that is not a labels file.
    csv_file = [f for f in os.listdir(path) if f.endswith('.csv') and 'labels' not in f][0]
    # Guess the delimiter from the header line: "," if it is strictly more frequent than ";",
    # otherwise ";".
    with open(path + "/" + csv_file, 'r') as file:
        first_line = file.readline()
        comma_count = first_line.count(',')
        semicolon_count = first_line.count(';')
        if comma_count > semicolon_count:
            sep = ","
        else:
            sep = ";"
    # Only the number of columns is needed, so parse just the header (nrows=0) instead of the
    # whole file; this also avoids the mixed-dtype DtypeWarning a full parse can emit.
    df = pd.read_csv(path + "/" + csv_file, sep=sep, nrows=0)
    dataset_openml += [folder] * df.shape[1]
    dataset_openml_info += [dataset_type[e]] * df.shape[1]


Columns (9,12,13,14,15) have mixed types. Specify dtype option on import or set low_memory=False.



In [32]:
# One row per OpenML column: its header, CASSED's prediction, the true label, and the folder
# and dataset type assigned by the folder scan. Assumes all five inputs have one entry per
# column, in the same order — TODO confirm.
results_df_test_openml = pd.DataFrame({
    "Column": test_openml.columns,
    "Prediction": predictions,
    "True Label": test_openml_pii["label"].values,
    "Dataset": dataset_openml,
    "Dataset Type": dataset_openml_info
})

In [33]:
# False negatives: columns whose true label is "pii" but which were predicted "non-pii".
results_df_test_openml.loc[(results_df_test_openml["True Label"] == "pii") & (results_df_test_openml["Prediction"] == "non-pii")]

Unnamed: 0,Column,Prediction,True Label,Dataset,Dataset Type
0,User ID,non-pii,pii,FitBit_HeartRate,pii info
3,Username,non-pii,pii,CSM,pii info
70,CustomerID,non-pii,pii,mango_detection_australia,pii info
110,V1,non-pii,pii,DATASETBANK,non-pii info


In [34]:
# False positives: columns whose true label is "non-pii" but which were predicted "pii".
results_df_test_openml.loc[(results_df_test_openml["True Label"] == "non-pii") & (results_df_test_openml["Prediction"] == "pii")]

Unnamed: 0,Column,Prediction,True Label,Dataset,Dataset Type
6,Location,pii,non-pii,CSM,pii info
69,Aggregate.Followers,pii,non-pii,mango_detection_australia,pii info
86,Address,pii,non-pii,TVS_Loan_Default,pii info
87,Zip,pii,non-pii,TVS_Loan_Default,pii info
92,Lat,pii,non-pii,TVS_Loan_Default,pii info
105,customer_id,pii,non-pii,TVS_Loan_Default,pii info


# OpenML 2

In [35]:
# Load the combined columns of the second OpenML collection and their labels file. Later cells
# read test_openml_2.columns and test_openml_2_pii["label"].
test_openml_2 = pd.read_csv("../../datasets/openml_datasets_2/all_datasets.csv")
test_openml_2_pii = pd.read_csv("../../datasets/openml_datasets_2/all_datasets_labels_pii.csv")

In [36]:
# CASSED outputs for the second OpenML collection: indicator matrices for the true and the
# predicted labels, plus the table that names the indicator columns.
openml_2_true = pd.read_csv("../CASSED_model_results/cassed_pii/openml_2/true.csv")
openml_2_pred = pd.read_csv("../CASSED_model_results/cassed_pii/openml_2/predicted.csv")
openml_2_target_names = pd.read_csv("../CASSED_model_results/cassed_pii/openml_2/target_names.csv")
# Turn the table into a lookup: value of column "0.1" -> value of column "0"
# (presumably column position -> label name — TODO confirm).
openml_2_target_names = {a[1]: a[0] for a in openml_2_target_names[["0", "0.1"]].values}
# Position 0 is given the placeholder label (this overwrites any existing entry for key 0).
openml_2_target_names[0] = "no prediction"

In [37]:
# Decode the indicator matrices into label names: for every row, collect the names of all
# column positions whose value equals 1. This overwrites the notebook-wide `predictions` and
# `true` from the previous section.
# NOTE(review): position 0 decodes to "no prediction"; if the first CSV column is a saved row
# index rather than an indicator, the row whose index is 1 would be decoded wrongly — confirm
# the file layout.
predictions, true = [], []
for i in range(openml_2_true.shape[0]):
    predictions.append([openml_2_target_names[e] for e, b in enumerate(openml_2_pred.iloc[i,:]) if b == 1])
    true.append([openml_2_target_names[e] for e, b in enumerate(openml_2_true.iloc[i,:]) if b == 1])
# Keep only the first decoded label per row. A row without any 1 yields an empty list and
# makes a[0] raise IndexError.
true = [a[0] for a in true]
predictions = [a[0] for a in predictions]

In [38]:
# Metrics and confusion matrix for the second OpenML collection. `true` and `predictions` are
# notebook-wide names that other sections reassign, so run this right after the decoding cell.
plot_cassed_results(true, predictions)

In [39]:
# Attribute every column of the second OpenML collection to the folder (dataset) it comes from
# and tag it with that dataset's type, producing one list entry per column.
folders = [name for name in os.listdir("../../datasets/openml_datasets_2") if os.path.isdir(os.path.join("../../datasets/openml_datasets_2", name))]
dataset_openml_2, dataset_openml_2_info = [], []
# One type per folder, paired with `folders` by position.
# NOTE(review): os.listdir returns entries in arbitrary order, so this positional pairing — and
# the assumption that the folder order equals the column order of all_datasets.csv — is fragile.
# The list is also identical to the one used for the Kaggle folders; verify it was adapted
# for this collection.
dataset_type = ["pii info", "pii info", "non-pii info", "pii info", "pii info", "pii info", "pii info", "non-pii info",
                "non-pii info", "pii info", "pii info", "non-pii info", "pii info", "pii info", "non-pii info"]
for e, folder in enumerate(folders):
    path = "../../datasets/openml_datasets_2/" + folder
    # First CSV in the folder that is not a labels file.
    csv_file = [f for f in os.listdir(path) if f.endswith('.csv') and 'labels' not in f][0]
    # Guess the delimiter from the header line: "," if it is strictly more frequent than ";",
    # otherwise ";".
    with open(path + "/" + csv_file, 'r') as file:
        first_line = file.readline()
        comma_count = first_line.count(',')
        semicolon_count = first_line.count(';')
        if comma_count > semicolon_count:
            sep = ","
        else:
            sep = ";"
    # Only the number of columns is needed, so parse just the header (nrows=0) instead of the
    # whole file; this also avoids the mixed-dtype DtypeWarning a full parse can emit.
    df = pd.read_csv(path + "/" + csv_file, sep=sep, nrows=0)
    dataset_openml_2 += [folder] * df.shape[1]
    dataset_openml_2_info += [dataset_type[e]] * df.shape[1]

In [40]:
# One row per column of the second OpenML collection: its header, CASSED's prediction, the true
# label, and the folder and dataset type assigned by the folder scan. Assumes all five inputs
# have one entry per column, in the same order — TODO confirm.
results_df_test_openml_2 = pd.DataFrame({
    "Column": test_openml_2.columns,
    "Prediction": predictions,
    "True Label": test_openml_2_pii["label"].values,
    "Dataset": dataset_openml_2,
    "Dataset Type": dataset_openml_2_info
})

In [41]:
# False negatives: columns whose true label is "pii" but which were predicted "non-pii".
results_df_test_openml_2.loc[(results_df_test_openml_2["True Label"] == "pii") & (results_df_test_openml_2["Prediction"] == "non-pii")]

Unnamed: 0,Column,Prediction,True Label,Dataset,Dataset Type
91,Speaker Number,non-pii,pii,echoMonths,non-pii info


In [42]:
# False positives: columns whose true label is "non-pii" but which were predicted "pii".
results_df_test_openml_2.loc[(results_df_test_openml_2["True Label"] == "non-pii") & (results_df_test_openml_2["Prediction"] == "pii")]

Unnamed: 0,Column,Prediction,True Label,Dataset,Dataset Type
