In [1]:
import pandas as pd
import re

from plotly.subplots import make_subplots
import plotly.graph_objects as go

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer

# Evaluation results of multiclass task on all unique data with CASSED

In [2]:
def read_results(file_path):
    """Parse the per-label metrics table found in a training log.

    Reading starts after the first line containing ``'precision'`` (the table
    header); every later non-blank line is treated as a table row, except that
    any further line containing ``'precision'`` is skipped as well. The final
    collected line is discarded, and the word ``"avg"`` is removed from the
    last four remaining rows so that each row splits into exactly five fields.

    Parameters
    ----------
    file_path : str
        Path of the log file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``Label``, ``Precision``, ``Recall``, ``F1-Score`` and
        ``Support``; the four metric columns are converted to numbers.

    Raises
    ------
    ValueError
        If one of the last four rows does not contain the token ``"avg"``, or
        if a row does not split into five fields.
    """
    column_names = ['Label', 'Precision', 'Recall', 'F1-Score', 'Support']
    table_rows = []
    header_seen = False

    with open(file_path, 'r') as log_file:
        for raw_line in log_file:
            if 'precision' in raw_line:
                # Header line: switch parsing on, but never keep the line itself.
                header_seen = True
                continue
            stripped = raw_line.strip()
            if header_seen and stripped:
                table_rows.append(stripped)

    # The last collected line is not part of the table.
    fields = [row.split() for row in table_rows[:-1]]
    # Summary rows have two-word labels ("... avg"); drop the second word.
    for summary_row in fields[-4:]:
        summary_row.remove("avg")

    df = pd.DataFrame(fields, columns=column_names)
    for metric_column in column_names[1:]:
        df[metric_column] = pd.to_numeric(df[metric_column])

    return df.reset_index(drop=True)

In [3]:
# Parse the per-label precision/recall/F1/support table out of the dessi-mf training log.
results_all_data = read_results('../CASSED_model_results/cassed_multiclass/dessi-mf/training.log')

In [4]:
# Display the parsed metrics table.
results_all_data

Unnamed: 0,Label,Precision,Recall,F1-Score,Support
0,phone_number,1.0,1.0,1.0,1280
1,national_identification_number,1.0,1.0,1.0,790
2,date,1.0,1.0,1.0,626
3,email,1.0,1.0,1.0,547
4,gender,1.0,1.0,1.0,479
5,company,1.0,1.0,1.0,453
6,integer_number,1.0,1.0,1.0,443
7,color,1.0,1.0,1.0,443
8,float_number,1.0,1.0,1.0,441
9,isbn,1.0,1.0,1.0,441


In [5]:
# Two panels sharing the y axis: a wide one with the per-label rows and a
# narrow one with three of the four trailing summary rows (the last row is left out).
fig = make_subplots(
    rows=1,
    cols=2,
    column_widths=[0.8, 0.2],
    shared_yaxes=True,
    y_title="score",
    horizontal_spacing=0.025,
    vertical_spacing=0.3,
)

colors = ["blue",  "red", "green"]

per_label_rows = results_all_data.iloc[:-4]
summary_rows = results_all_data.iloc[-4:-1]

for metric, bar_color in zip(["Precision", "Recall", "F1-Score"], colors):
    # Same legend group in both panels, but only the left trace gets a legend entry.
    panels = [(per_label_rows, True), (summary_rows, False)]
    for panel_col, (rows, in_legend) in enumerate(panels, start=1):
        fig.add_trace(
            go.Bar(
                x=rows["Label"],
                y=rows[metric],
                legendgroup=metric,
                showlegend=in_legend,
                marker_color=bar_color,
                name=metric,
            ),
            row=1,
            col=panel_col,
        )

fig.update_layout(
    title="Results of CASSED MultiClass Classification on dessi-mf",
    barmode='group',
    width=1200,
    height=500,
    legend_title="metric",
)
fig.show()

In [6]:
# List every row of the metrics table whose F1 score is below 1.
results_all_data.loc[results_all_data["F1-Score"] < 1]

Unnamed: 0,Label,Precision,Recall,F1-Score,Support


In [7]:
def extract_prediction(i, predictions):
    """Return the sorted label names found in the first cell of row ``i``.

    A label is a word (optionally with one hyphenated part) that is followed
    by a number in parentheses, e.g. ``email (1.0)``; only the name is kept.

    Parameters
    ----------
    i : int
        Positional row index into ``predictions``.
    predictions : pandas.DataFrame
        Frame whose first column holds the text to scan.

    Returns
    -------
    list of str
        Label names in ascending order.
    """
    label_pattern = re.compile(r"\b(\w+(?:-\w+)?)\s*\(\d+(\.\d+)?\)")
    cell_text = predictions.iloc[i, :].values[0]
    # findall yields (name, fractional-part) tuples; only the name is wanted.
    labels = [groups[0] for groups in label_pattern.findall(cell_text)]
    labels.sort()
    return labels


def convert_predictions(predictions):
    """Turn the line-oriented prediction dump into one row per evaluated column.

    Only the first column of ``predictions`` is read. Its header holds the
    first column name; each following line is classified by its prefix:

    * ``'Gold:'`` within the first 10 characters -> gold labels,
    * ``'Pred:'`` within the first 10 characters -> predicted labels; the
      entry is flagged as a mismatch when the next line carries ``'MISMATCH'``
      within its first 20 characters,
    * ``'MISMATCH'`` within the first 20 characters -> marker line, skipped,
    * anything else -> name of the next column (the text before the first ``'. '``).

    Parameters
    ----------
    predictions : pandas.DataFrame
        Frame as read from the prediction file; the first column must hold strings.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name``, ``Gold``, ``Pred`` (sorted label lists) and
        ``Mismatch`` (``"Mismatch"`` / ``"No Mismatch"``).

    Raises
    ------
    ValueError
        Raised by pandas when the collected lists have different lengths,
        i.e. the file does not follow the layout described above.
    """
    # Read the first column once instead of rebuilding a row Series several times per line.
    lines = list(predictions.iloc[:, 0])
    last_index = len(lines) - 1

    column_names = [predictions.columns[0].split(". ")[0].strip()]
    gold_list = []
    pred_list = []
    mismatch = []

    for i, line in enumerate(lines):
        if 'Gold:' in line[:10]:
            gold_list.append(extract_prediction(i, predictions))
        elif 'Pred:' in line[:10]:
            pred_list.append(extract_prediction(i, predictions))
            # A prediction on the very last line cannot be followed by a marker.
            flagged = i < last_index and 'MISMATCH' in lines[i + 1][:20]
            mismatch.append("Mismatch" if flagged else "No Mismatch")
        elif 'MISMATCH' not in line[:20]:
            column_names.append(line.split('. ')[0].strip())

    df = pd.DataFrame({
        'Name': column_names,
        'Gold': gold_list,
        'Pred': pred_list,
        'Mismatch': mismatch
    })
    return df

# Load the tab-separated prediction dump for dessi-mf and reshape it to one row per column.
all_data_multiclass = pd.read_csv('../CASSED_model_results/cassed_multiclass/dessi-mf/test.tsv', sep='\t') 
all_data_multiclass_df = convert_predictions(all_data_multiclass)

In [8]:
# Display the converted predictions (name, gold labels, predicted labels, mismatch flag).
all_data_multiclass_df

Unnamed: 0,Name,Gold,Pred,Mismatch
0,f4WINK6i,[integer_number],[integer_number],No Mismatch
1,credit_card_number_fr_80,[credit_card_number],[credit_card_number],No Mismatch
2,company_en_266,[company],[company],No Mismatch
3,credit_card_provider_fr_FR_308,[credit_card_provider],[credit_card_provider],No Mismatch
4,MAidccS9b5Stvo,[title],[title],No Mismatch
...,...,...,...,...
14733,blood_type_fr_208,[blood_group],[blood_group],No Mismatch
14734,email_en_50,[email],[email],No Mismatch
14735,z6r0fbtbI3aJtXFu5ON,[title],[title],No Mismatch
14736,hwdskaedozyz,[date],[date],No Mismatch


In [12]:
# all_data_multiclass_df[all_data_multiclass_df['Mismatch']=='No Mismatch']

In [13]:
# Binarise gold and predicted label lists; the binarizer is fitted on the gold labels only.
mlb = MultiLabelBinarizer()
y_true = mlb.fit_transform(all_data_multiclass_df["Gold"])
y_pred = mlb.transform(all_data_multiclass_df["Pred"])

y_true_df = pd.DataFrame(y_true)
y_pred_df = pd.DataFrame(y_pred)

# confusion_mat[i][j] = share of the columns carrying true class i that also carry predicted class j.
confusion_mat = []
for i in range(len(mlb.classes_)):
    # Both frames share the same default index, so the boolean mask can be applied directly.
    pred_true_compare = y_pred_df.loc[y_true_df[i] == 1]
    arr_label = []
    for j in range(len(mlb.classes_)):
        # .get(1) returns None when class j was never predicted for these columns.
        val = pred_true_compare.loc[:, j].value_counts(normalize=True).get(1)
        arr_label.append(val.item() if val is not None else 0)
    confusion_mat.append(arr_label)
fig = go.Figure(go.Heatmap(z=confusion_mat, x=mlb.classes_, y=mlb.classes_, colorscale='Blues'))
fig.update_layout(title_text='Confusion Matrix', height=800, width=800)
fig.update_xaxes(title="Predicted Label")
fig.update_yaxes(title="True Label")
fig.show()

The lightly shaded off-diagonal fields are caused by multilabel columns; the predictions are still correct because these fields are symmetrical.

In [15]:
# Show only the columns that were followed by a MISMATCH marker in the prediction dump.
all_data_multiclass_df.loc[all_data_multiclass_df["Mismatch"] == "Mismatch"]

Unnamed: 0,Name,Gold,Pred,Mismatch


# Evaluation on the two-languages test set

In [16]:
# Two-languages test set files (contents presumed from the file names: data, labels,
# class names and source dataset per column — confirm against the dataset folder).
two_lan_data = pd.read_csv("../../datasets/test_languages/test.csv")
two_lan_labels = pd.read_csv("../../datasets/test_languages/test_labels_multiclass.csv")
two_lan_classes = pd.read_csv("../../datasets/test_languages/test_classes.csv")
two_lan_dataset = pd.read_csv("../../datasets/test_languages/test_dataset.csv")

# CASSED outputs for that test set: true and predicted label matrices plus the label names.
two_lan_true = pd.read_csv('../CASSED_model_results/cassed_multiclass/two_languages/true.csv')
two_lan_pred = pd.read_csv('../CASSED_model_results/cassed_multiclass/two_languages/predicted.csv')
two_lan_target_names = pd.read_csv('../CASSED_model_results/cassed_multiclass/two_languages/target_names.csv')       
# Replace the frame by a dict: key = value of column "0.1", value = value of column "0".
# The keys are later looked up by column position, so they are presumably integer class indices — confirm.
two_lan_target_names = {a[1]: a[0] for a in two_lan_target_names[["0", "0.1"]].values}
# Key 0 is reported as "no prediction" (overrides any name read from the file for that key).
two_lan_target_names[0] = "no prediction"

In [17]:
# Decode every row of the indicator matrices into a list of label names:
# a label is kept when the entry at its column position equals 1.
predictions = [
    [two_lan_target_names[position] for position, flag in enumerate(two_lan_pred.iloc[row, :]) if flag == 1]
    for row in range(two_lan_true.shape[0])
]
true = [
    [two_lan_target_names[position] for position, flag in enumerate(two_lan_true.iloc[row, :]) if flag == 1]
    for row in range(two_lan_true.shape[0])
]

In [18]:
def plot_cassed_results(true, pred):
    """Show summary metrics and a row-normalised confusion matrix side by side.

    The label binarizer is fitted on ``true`` only. Labels that occur in
    ``pred`` but never in ``true`` are therefore dropped by scikit-learn (it
    emits an "unknown class(es) ... will be ignored" warning) and appear
    neither in the metrics nor in the confusion matrix.

    Parameters
    ----------
    true : iterable of list of str
        True label list for every evaluated column.
    pred : iterable of list of str
        Predicted label list for every evaluated column, in the same order.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    mlb = MultiLabelBinarizer()
    y_true = mlb.fit_transform(true)
    y_pred = mlb.transform(pred)

    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average="weighted")
    f1_macro = f1_score(y_true, y_pred, average="macro")
    precision = precision_score(y_true, y_pred, average="weighted", zero_division=0)
    recall = recall_score(y_true, y_pred, average="weighted")

    fig = make_subplots(
        rows=1, cols=2, 
        subplot_titles=["Evaluation Metrics", "Confusion Matrix"]
    )

    metrics = {
        "weighted Precision": precision,
        "weighted Recall": recall,
        "weighted F1 Score": f1,
        "macro F1 Score": f1_macro,
        "Accuracy": accuracy,
    }

    fig.add_trace(go.Bar(
        x=list(metrics.keys()),
        y=list(metrics.values()), 
        showlegend=False
    ), row=1, col=1)

    y_true_df = pd.DataFrame(y_true)
    y_pred_df = pd.DataFrame(y_pred)
    n_classes = len(mlb.classes_)

    # confusion_mat[i][j] = share of the columns carrying true class i that also carry predicted class j.
    confusion_mat = []
    for i in range(n_classes):
        # Both frames share the same default index, so the boolean mask can be applied directly.
        pred_true_compare = y_pred_df.loc[y_true_df[i] == 1]
        arr_label = []
        for j in range(n_classes):
            # .get(1) returns None when class j was never predicted for these columns.
            val = pred_true_compare.loc[:, j].value_counts(normalize=True).get(1)
            arr_label.append(val.item() if val is not None else 0)
        confusion_mat.append(arr_label)
    fig.add_trace(go.Heatmap(z=confusion_mat, x=mlb.classes_, y=mlb.classes_, colorscale='Blues'), row=1, col=2)

    fig.update_layout(
        title="Confusion Matrix and Evaluation Metrics",
        height=500, 
        width=1000,
        showlegend=False
    )

    fig.show()
# Evaluate the decoded two-languages predictions against their true labels.
plot_cassed_results(true, predictions)


unknown class(es) ['EAN_code', 'no prediction'] will be ignored



In [19]:
# One row per test column. Later cells address these columns by position
# (0 = Column, 1 = Prediction, 2 = True Label, 3 = Classes), so keep this order.
results_df_two_lan = pd.DataFrame({
    "Column": two_lan_data.columns,
    "Prediction": predictions,
    "True Label": true,
    "Classes": two_lan_classes["class"],
    "Dataset": two_lan_dataset["dataset"]
})

In [20]:
# Lower-cased first underscore-separated token of every distinct class name;
# is_valid_word() compares column names against this list.
CLASS_NAMES = results_df_two_lan["Classes"].unique()
CLASS_NAMES = [a.lower().split("_")[0] for a in CLASS_NAMES]

def is_valid_word(word):
    """Return True when the lower-cased text of ``word`` before its first underscore is in CLASS_NAMES."""
    leading_token = word.lower().split("_")[0]
    return leading_token in CLASS_NAMES

# Language reported for a class name, keyed by its lower-cased last three characters.
_LANGUAGE_BY_SUFFIX = {
    "_fr": "french",
    "_it": "italian",
    "_zh": "chinese",
    "_de": "german",
    "_en": "english",
}


def _strip_language_suffix(class_name):
    """Return ``class_name`` without its trailing language marker (6 or 3 characters)."""
    if "mixed" in class_name or "de_DE" in class_name or "fr_FR" in class_name:
        return class_name[:-6]
    if any(tag in class_name for tag in ("_en", "_de", "_fr", "_it", "_zh")):
        return class_name[:-3]
    return class_name


def _language_of(class_name):
    """Return the language encoded in ``class_name``, or "dessi data" when none is recognised."""
    if "mixed" in class_name:
        return "mixed language"
    return _LANGUAGE_BY_SUFFIX.get(class_name[-3:].lower(), "dessi data")


def _subset_accuracy(true_labels, predicted_labels):
    """Subset accuracy of the predicted label lists against the true label lists.

    The binarizer is fitted on the true labels only, so predicted labels that
    never occur as a true label in this subset are ignored by scikit-learn.
    """
    mlb = MultiLabelBinarizer()
    y_true = mlb.fit_transform(true_labels)
    y_pred = mlb.transform(predicted_labels)
    return accuracy_score(y_true, y_pred)


def _accuracy_per_group(results_df, column):
    """Map every distinct value of ``column`` to the subset accuracy of its rows."""
    accuracies = {}
    for value in results_df[column].unique():
        group_df = results_df.loc[results_df[column] == value]
        accuracies[value] = _subset_accuracy(group_df["True Label"], group_df["Prediction"])
    return accuracies


def _sorted_by_accuracy(accuracies):
    """Return a copy of ``accuracies`` ordered from highest to lowest value."""
    return dict(sorted(accuracies.items(), key=lambda item: item[1], reverse=True))


def create_analysis_plot(results_df, dataset="dessi-mf"):
    """Plot prediction accuracy broken down by class, language, dataset and column name.

    Two figures are shown: (1) accuracy per class plus stacked bars of the
    correct and false predictions per class, split by language and by whether
    the column name is a valid word; (2) accuracy per language, per dataset
    and per column-name validity.

    Side effect: the columns ``Classes_new`` and ``Language`` are added to
    ``results_df`` in place.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Must provide the columns ``True Label``, ``Prediction`` and
        ``Dataset``. Three columns are read by position: 0 (column name),
        1 (prediction), 2 (true label) and 3 (class name, including its
        language marker).
    dataset : str, default "dessi-mf"
        Name shown in the figure titles. ``"two languages"`` selects the
        italian/chinese language split for the stacked bars; any other value
        selects english/french/german/mixed language/dessi data.

    Returns
    -------
    None
        Both figures are displayed with ``fig.show()``.
    """
    class_names = results_df.iloc[:, 3]
    results_df["Classes_new"] = [_strip_language_suffix(name) for name in class_names]
    results_df["Language"] = [_language_of(name) for name in class_names]

    accuracies_lan = _accuracy_per_group(results_df, "Language")
    accuracies_cla = _accuracy_per_group(results_df, "Classes_new")
    accuracies_dat = _accuracy_per_group(results_df, "Dataset")

    # Split the rows by whether the column name (position 0) is a valid word.
    n_rows = results_df.shape[0]
    valid_rows = [i for i in range(n_rows) if is_valid_word(results_df.iloc[i, 0])]
    valid_lookup = set(valid_rows)
    invalid_rows = [i for i in range(n_rows) if i not in valid_lookup]
    accuracies_col = {}
    accuracies_col["valid_col_names"] = _subset_accuracy(
        results_df.iloc[valid_rows, 2], results_df.iloc[valid_rows, 1]
    )
    accuracies_col["invalid_col_names"] = _subset_accuracy(
        results_df.iloc[invalid_rows, 2], results_df.iloc[invalid_rows, 1]
    )

    accuracies_cla = _sorted_by_accuracy(accuracies_cla)
    accuracies_lan = _sorted_by_accuracy(accuracies_lan)
    accuracies_col = _sorted_by_accuracy(accuracies_col)
    accuracies_dat = _sorted_by_accuracy(accuracies_dat)

    fig = make_subplots(rows=3, cols=1, subplot_titles=["Accuracy per Class", "Number of correct predictions per Class", "Number of false predictions per Class"],
                    shared_xaxes=True)
    # No row/col given: the trace goes to the first subplot.
    fig.add_trace(go.Bar(
        x=list(accuracies_cla.keys()),
        y=list(accuracies_cla.values()),
        marker=dict(color="black"),
        showlegend=False
    ))
    colors = [
        "#d62728",  # Red (High Saturation)
        "#ff9896",  # Red (Low Saturation)
        "#1f77b4",  # Blue (High Saturation)
        "#aec7e8",  # Blue (Low Saturation)
        "#ff7f0e",  # Orange (High Saturation)
        "#ffbb78",  # Orange (Low Saturation)
        "#2ca02c",  # Green (High Saturation)
        "#98df8a",  # Green (Low Saturation)
        "#9467bd",  # Purple (High Saturation)
        "#c5b0d5",  # Purple (Low Saturation)
        ]

    languages = ['english', 'french', 'german', 'mixed language', 'dessi data'] if dataset != "two languages" else ["italian", "chinese"]
    # bool_val=True fills row 2 (correct predictions), bool_val=False fills row 3 (false predictions).
    for bool_val in [True, False]:
        for e, language in enumerate(languages):
            lan_df = results_df.loc[results_df["Language"] == language]
            share_by_class = {}
            for ee, col_valid in enumerate([True, False]):
                percentage_of_this_language = []
                for class_name in lan_df["Classes_new"].unique():
                    cla_df = lan_df.loc[lan_df["Classes_new"] == class_name]
                    keep = [j for j in range(cla_df.shape[0]) if is_valid_word(cla_df.iloc[j, 0]) == col_valid]
                    cla_df = cla_df.iloc[keep, :]
                    # Weight of this language/validity slice within the whole class.
                    percentage_of_this_language.append(cla_df.shape[0] / results_df.loc[results_df["Classes_new"] == class_name].shape[0])
                    share_by_class[class_name] = (cla_df["True Label"] == cla_df["Prediction"]).value_counts(normalize=True).get(bool_val, 0)
                fig.add_trace(go.Bar(
                    x=list(share_by_class.keys()),
                    y=[a * b for a, b in zip(list(share_by_class.values()), percentage_of_this_language)],
                    marker=dict(color=colors[2*e+ee]),
                    name=f"{language}_" + ("valid" if col_valid else "invalid") + "<br>column name",
                    # One legend entry per language/validity pair, taken from the "correct" row.
                    showlegend=bool_val,
                    legendgroup=2*e+ee
                ), row=2 if bool_val else 3, col=1)
    fig.update_layout(title=f"Accuracy per Class for CASSED's PII Predictions on {dataset}", width=1500, height=700, barmode="stack")
    fig.update_yaxes(title_text="Accuracy", row=1, col=1)
    fig.update_yaxes(title_text="Amount of<br>correct predictions", row=2, col=1)
    fig.update_yaxes(title_text="Amount of<br>false predictions", row=3, col=1)
    fig.show()

    fig = make_subplots(
        rows=1, cols=3, shared_yaxes=True,
        subplot_titles=["Accuracy per Language", "Accuracy per Dataset", "Accuracy per Column Name"]
    )
    fig.add_trace(go.Bar(
        x=list(accuracies_lan.keys()),
        y=list(accuracies_lan.values()),
        showlegend=False
    ), row=1, col=1)
    fig.add_trace(go.Bar(
        x=list(accuracies_dat.keys()),
        y=list(accuracies_dat.values()),
        showlegend=False
    ), row=1, col=2)
    fig.add_trace(go.Bar(
        x=list(accuracies_col.keys()),
        y=list(accuracies_col.values()),
        showlegend=False
    ), row=1, col=3)
    fig.update_layout(width=900, height=500, title=f"Accuracy of CASSED's PII Predictions on {dataset} with respect to different categories")
    fig.show()

In [21]:
# Break the two-languages results down by class, language, dataset and column-name validity.
# Note: this call adds the columns "Classes_new" and "Language" to results_df_two_lan.
create_analysis_plot(results_df_two_lan, dataset="two languages")


unknown class(es) ['no prediction'] will be ignored


unknown class(es) ['EAN_code', 'no prediction'] will be ignored


unknown class(es) ['no prediction'] will be ignored


unknown class(es) ['no prediction'] will be ignored


unknown class(es) ['no prediction'] will be ignored


unknown class(es) ['no prediction'] will be ignored


unknown class(es) ['gender', 'no prediction'] will be ignored


unknown class(es) ['first_name', 'no prediction'] will be ignored


unknown class(es) ['no prediction'] will be ignored


unknown class(es) ['no prediction'] will be ignored


unknown class(es) ['no prediction'] will be ignored


unknown class(es) ['no prediction'] will be ignored


unknown class(es) ['no prediction'] will be ignored


unknown class(es) ['no prediction'] will be ignored


unknown class(es) ['last_name', 'no prediction'] will be ignored


unknown class(es) ['no prediction'] will be ignored


unknown class(es) ['no prediction'] will be ignored


unknown class(es) ['no predictio

Comparison of results on both datasets

In [22]:
# Bring the dessi-mf predictions into the column layout of results_df_two_lan.
all_data_df = all_data_multiclass_df.rename(columns={"Name": "Column", "Gold": "True Label", "Pred": "Prediction"}).drop(columns="Mismatch")
# Reorder to Column, Prediction, True Label.
all_data_df = all_data_df.iloc[:, [0,2,1]]
# Attach the class names by position (assumes test_classes.csv is in the same row order — confirm).
all_data_df["Classes"] = pd.read_csv("../../datasets/dessi-mf/dessi-mf/test_classes.csv")["class"].values
# Stack both test sets; the row index is not reset, so index values repeat.
all_data_df = pd.concat([all_data_df, results_df_two_lan.iloc[:, [0,1,2,3]]], axis=0)


fig = make_subplots(rows=1, cols=2, shared_yaxes=True, y_title="score", subplot_titles=["Evaluation Metrics by dataset", "Accuracies by language"])

# Left panel: one group of metric bars per dataset.
# The loop variables are deliberately not named `true`/`pred`: `true` is the
# notebook-level label list built earlier, and overwriting it here would break
# any re-run of the cells that use it.
gold_by_dataset = [all_data_multiclass_df.iloc[:,1], results_df_two_lan.iloc[:,2]]
pred_by_dataset = [all_data_multiclass_df.iloc[:,2], results_df_two_lan.iloc[:,1]]
for gold_labels, pred_labels, dataset_name in zip(gold_by_dataset, pred_by_dataset, ["dessi-mf", "two languages"]):
    # The binarizer is fitted on the gold labels only; unseen predicted labels are ignored.
    mlb = MultiLabelBinarizer()
    y_true = mlb.fit_transform(gold_labels)
    y_pred = mlb.transform(pred_labels)

    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average="weighted")
    f1_macro = f1_score(y_true, y_pred, average="macro")
    precision = precision_score(y_true, y_pred, average="weighted", zero_division=0)
    recall = recall_score(y_true, y_pred, average="weighted")

    metrics = {
        "weighted Precision": precision,
        "weighted Recall": recall,
        "weighted F1 Score": f1,
        "macro F1 Score": f1_macro,
        "Accuracy": accuracy,
    }

    fig.add_trace(go.Bar(
        x=list(metrics.keys()),
        y=list(metrics.values()), 
        name=dataset_name,
        legendgroup=dataset_name,
    ), row=1, col=1)


# Derive the language of every row from its class name (column position 3):
# "mixed" anywhere wins, otherwise the lower-cased last three characters decide.
language_by_suffix = {
    "_fr": "french",
    "_it": "italian",
    "_zh": "chinese",
    "_de": "german",
    "_en": "english",
}
lan = [
    "mixed language" if "mixed" in class_name
    else language_by_suffix.get(class_name[-3:].lower(), "dessi data")
    for class_name in all_data_df.iloc[:, 3]
]
all_data_df["Language"] = lan


# Subset accuracy per language, in order of first appearance of each language.
accuracies_lan = {}
for language, lan_df in all_data_df.groupby("Language", sort=False):
    mlb = MultiLabelBinarizer()
    y_true = mlb.fit_transform(lan_df["True Label"])
    y_pred = mlb.transform(lan_df["Prediction"])
    accuracies_lan[language] = accuracy_score(y_true, y_pred)

# Right panel: accuracy per language, coloured by the dataset the language belongs to.
colors = ["#636EFA", "#EF553B"]
language_groups = [
    ("dessi-mf", ["english", "french", "german", "mixed language", "dessi data"]),
    ("two languages", ["italian", "chinese"]),
]
for (group_name, group_languages), group_color in zip(language_groups, colors):
    fig.add_trace(
        go.Bar(
            x=group_languages,
            y=[accuracies_lan[key] for key in group_languages],
            legendgroup=group_name,
            showlegend=False,
            marker=dict(color=group_color),
        ),
        row=1,
        col=2,
    )

fig.update_layout(title="Evaluation of CASSED MultiClass Classification", height=350, width=800)

fig.show()


unknown class(es) ['EAN_code', 'no prediction'] will be ignored


unknown class(es) ['no prediction'] will be ignored


unknown class(es) ['EAN_code', 'no prediction'] will be ignored

