In [1]:
import pandas as pd
import re
import numpy as np

from plotly.subplots import make_subplots
import plotly.graph_objects as go

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Each dataset directory is read into five frames, in this order:
# test.csv, personal labels, classes, multiclass labels, dataset-of-origin.

# dessi-mf_gpt directory (these frames feed the GPT result tables below).
df_llm, labels_llm, classes_llm, multiclass_llm, dataset_llm = map(pd.read_csv, (
    '../datasets/dessi-mf/dessi-mf_gpt/test.csv',
    '../datasets/dessi-mf/dessi-mf_gpt/test_labels_personal.csv',
    '../datasets/dessi-mf/dessi-mf_gpt/test_classes.csv',
    '../datasets/dessi-mf/dessi-mf_gpt/test_labels_multiclass.csv',
    '../datasets/dessi-mf/dessi-mf_gpt/test_dataset.csv',
))

# dessi-mf directory (note: the multiclass file is named test_labels_multi.csv here).
df_all_data, labels_all_data, classes_all_data, multiclass_all_data, dataset_all_data = map(pd.read_csv, (
    '../datasets/dessi-mf/dessi-mf/test.csv',
    '../datasets/dessi-mf/dessi-mf/test_labels_personal.csv',
    '../datasets/dessi-mf/dessi-mf/test_classes.csv',
    '../datasets/dessi-mf/dessi-mf/test_labels_multi.csv',
    '../datasets/dessi-mf/dessi-mf/test_dataset.csv',
))

# test_languages directory.
df_test_lan, labels_test_lan, classes_test_lan, multiclass_test_lan, dataset_test_lan = map(pd.read_csv, (
    '../datasets/test_languages/test.csv',
    '../datasets/test_languages/test_labels_personal.csv',
    '../datasets/test_languages/test_classes.csv',
    '../datasets/test_languages/test_labels_multiclass.csv',
    '../datasets/test_languages/test_dataset.csv',
))

## CASSED

In [3]:
def extract_prediction(i, predictions):
    """Return the sorted labels found in row ``i`` of ``predictions``.

    Only the first cell of the row is inspected. Every token of the form
    ``label (number)`` -- where ``label`` is a word, optionally followed by a
    hyphen and a second word -- contributes its ``label`` to the result.

    Args:
        i: Positional row index into ``predictions``.
        predictions: DataFrame whose first column holds the text to scan.

    Returns:
        Alphabetically sorted list of the extracted labels (possibly empty).
    """
    label_with_score = re.compile(r"\b(\w+(?:-\w+)?)\s*\(\d+(\.\d+)?\)")
    row_text = predictions.iloc[i].values[0]
    labels = [hit.group(1) for hit in label_with_score.finditer(row_text)]
    labels.sort()
    return labels


def convert_predictions(predictions):
    """Turn a line-oriented prediction report into one row per reported item.

    ``predictions`` is a DataFrame whose first column holds the report lines
    (the column header itself is the first line of the report). Lines are
    classified by their leading characters:

    * ``Gold:`` within the first 10 characters -> gold labels for the item,
    * ``Pred:`` within the first 10 characters -> predicted labels; the
      following line decides the mismatch flag,
    * ``MISMATCH`` within the first 20 characters -> skipped,
    * anything else -> the start of a new item; the text before the first
      ``". "`` is taken as the item's name.

    Returns:
        DataFrame with columns ``Name``, ``Gold``, ``Pred`` and ``Mismatch``.
    """
    lines = predictions.iloc[:, 0].tolist()
    last_index = len(lines) - 1

    # read_csv consumed the first report line as the header, so recover it here.
    names = [predictions.columns[0].split(". ")[0].strip()]
    golds, preds, flags = [], [], []

    for i, line in enumerate(lines):
        if 'Gold:' in line[:10]:
            golds.append(extract_prediction(i, predictions))
            continue

        if 'Pred:' in line[:10]:
            preds.append(extract_prediction(i, predictions))
            if i == last_index:
                # Nothing follows the final prediction, so it cannot be flagged.
                flags.append("No Mismatch")
                break
            followed_by_flag = 'MISMATCH' in lines[i + 1][:20]
            flags.append("Mismatch" if followed_by_flag else "No Mismatch")
            continue

        if 'MISMATCH' not in line[:20]:
            names.append(line.split('. ')[0].strip())

    return pd.DataFrame({
        'Name': names,
        'Gold': golds,
        'Pred': preds,
        'Mismatch': flags
    })

def _join_labels(label_lists):
    """Join each list of labels into one comma-separated string, dropping square brackets."""
    return [
        ",".join(item.replace("[", "").replace("]", "") for item in labels)
        for labels in label_lists
    ]


# --- CASSED on DeSSI-MF -------------------------------------------------------
# The report is read and parsed once (the original cell did this twice with
# identical arguments).
all_data_multiclass = pd.read_csv('../CASSED/CASSED_model_results/cassed_multiclass/dessi-mf/test.tsv', sep='\t')
all_data_multiclass_df_cassed = convert_predictions(all_data_multiclass)

# --- CASSED on the two-language test set ---------------------------------------
# true.csv / predicted.csv are indicator matrices: a 1 in column position e means
# the label at position e applies to that row.
two_lan_true = pd.read_csv('../CASSED/CASSED_model_results/cassed_multiclass/two_languages/true.csv')
two_lan_pred = pd.read_csv('../CASSED/CASSED_model_results/cassed_multiclass/two_languages/predicted.csv')
two_lan_target_names = pd.read_csv('../CASSED/CASSED_model_results/cassed_multiclass/two_languages/target_names.csv')
# Map the value in column "0.1" (used as the position key) to the value in column "0" (the label name).
two_lan_target_names = {a[1]: a[0] for a in two_lan_target_names[["0", "0.1"]].values}
two_lan_target_names[0] = "no prediction"

predictions, true = [], []
for i in range(two_lan_true.shape[0]):
    predictions.append([two_lan_target_names[e] for e, b in enumerate(two_lan_pred.iloc[i, :]) if b == 1])
    true.append([two_lan_target_names[e] for e, b in enumerate(two_lan_true.iloc[i, :]) if b == 1])

predictions = _join_labels(predictions)
true = _join_labels(true)

results_df_two_lan_cassed = pd.DataFrame({
    "Column": df_test_lan.columns,
    "Prediction": predictions,
    "True Label": true,
    "semantic_classes": classes_test_lan["class"],
    "Classes": multiclass_test_lan["label"],
    "Dataset": dataset_test_lan["dataset"]
})

results_df_all_data_cassed = pd.DataFrame({
    "Column": df_all_data.columns,
    "Prediction": _join_labels(all_data_multiclass_df_cassed["Pred"]),
    "True Label": _join_labels(all_data_multiclass_df_cassed["Gold"]),
    "semantic_classes": classes_all_data["class"],
    "Classes": multiclass_all_data["label"],
    "Dataset": dataset_all_data["dataset"]
})

## GPT

In [4]:
def load_responses(path):
    """Read the text file at ``path`` and return its lines without the first one.

    The content is split on ``"\\n"``, so a file ending in a newline yields a
    trailing empty string.
    """
    with open(path, "r") as handle:
        _first_line, *remaining_lines = handle.read().split("\n")
    return remaining_lines

# Matches `detected_classes: [ ... ]` with the key unquoted, single-quoted or
# double-quoted; group 2 is the raw content between the brackets.
# Raw string: the original non-raw f-string relied on the invalid escape "\[".
DETECTED_CLASSES_PATTERN = re.compile(
    r"""('detected_classes'|"detected_classes"|detected_classes): \[(.*?)\]"""
)


def parse_detected_classes(responses):
    """Extract the detected classes of every response as a sorted, comma-joined string.

    Args:
        responses: Iterable of response lines, each expected to contain a
            ``detected_classes: [...]`` entry.

    Returns:
        List with one string per response; quotes are removed and the class
        names are sorted alphabetically and joined with ``","``. An empty
        list in the response yields an empty string.

    Raises:
        ValueError: If a response contains no ``detected_classes`` list.
    """
    parsed = []
    for position, response in enumerate(responses):
        match = DETECTED_CLASSES_PATTERN.search(response)
        if match is None:
            raise ValueError(
                f"No detected_classes list found in response {position}: {response!r}"
            )
        detected = match.group(2).replace("'", "").replace("\"", "").replace(", ", ",")
        # Sorting a single-element split is a no-op, so no special case is needed.
        parsed.append(",".join(sorted(detected.split(","))))
    return parsed


def build_gpt_results(responses, columns_df, multiclass_df, classes_df, dataset_df):
    """Assemble the GPT result table for one dataset.

    Column order matters: ``analyse_results`` reads some columns by position.
    """
    return pd.DataFrame({
        "Name": columns_df.columns,
        "True Label": [",".join(sorted(s.split(","))) for s in multiclass_df["label"]],
        "Prediction": parse_detected_classes(responses),
        "semantic_classes": classes_df["class"],
        # NOTE(review): the CASSED tables store the multiclass label under
        # "Classes"; here it duplicates "semantic_classes". Kept as in the
        # original -- confirm which one is intended.
        "Classes": classes_df["class"],
        "Dataset": dataset_df["dataset"]
    })


responses_own_dataset = load_responses("../GPT/gpt_predictions/dessi-mf_results.txt")
responses_test_lan = load_responses("../GPT/gpt_predictions/test_languages_results.txt")

results_own_dataset_multiclass_gpt = build_gpt_results(
    responses_own_dataset, df_llm, multiclass_llm, classes_llm, dataset_llm
)
results_test_lan_multiclass_gpt = build_gpt_results(
    responses_test_lan, df_test_lan, multiclass_test_lan, classes_test_lan, dataset_test_lan
)

## Evaluation

In [5]:
def multilabel_metrics(results):
    """Compute multi-label metrics for one result table.

    ``results`` needs the string columns "True Label" and "Prediction", each
    holding comma-separated class names. Both are binarised over the union of
    all classes that occur in either column.

    Returns:
        Dict mapping the metric's display name to its value.
    """
    true = [set(a.split(",")) for a in results["True Label"]]
    pred = [set(a.split(",")) for a in results["Prediction"]]
    # Sorted so the indicator-matrix column order is the same on every run.
    all_classes = sorted(set().union(*true, *pred))
    mlb = MultiLabelBinarizer(classes=all_classes)
    y_true = mlb.fit_transform(true)
    y_pred = mlb.transform(pred)

    # zero_division=0 everywhere: classes that are never predicted (or never
    # occur) count as 0 without emitting a warning per metric.
    return {
        "weighted Precision": precision_score(y_true, y_pred, average="weighted", zero_division=0),
        "weighted Recall": recall_score(y_true, y_pred, average="weighted", zero_division=0),
        "weighted F1 Score": f1_score(y_true, y_pred, average="weighted", zero_division=0),
        "macro F1 Score": f1_score(y_true, y_pred, average="macro", zero_division=0),
        "Accuracy": accuracy_score(y_true, y_pred),
    }


fig = make_subplots(rows=1, cols=2, subplot_titles=["Evaluation Metrics for DeSSI-MF", "Evaluation Metrics for Test Languages"])

colors = ['#636efa', '#EF553B']

# Order: both GPT tables first, then both CASSED tables; within each pair the
# DeSSI-MF table goes to subplot column 1 and the test-languages table to column 2.
for e, results in enumerate([results_own_dataset_multiclass_gpt, results_test_lan_multiclass_gpt, results_df_all_data_cassed, results_df_two_lan_cassed]):
    metrics = multilabel_metrics(results)
    model_index = e // 2  # 0 -> GPT-4o, 1 -> CASSED

    fig.add_trace(go.Bar(
        x=list(metrics.keys()),
        y=list(metrics.values()),
        name="GPT-4o" if model_index == 0 else "CASSED",
        showlegend=(e % 2 == 0),  # one legend entry per model
        legendgroup=str(model_index),
        marker=dict(color=colors[model_index])
    ), row=1, col=(e % 2) + 1)

fig.update_layout(
    title="Evaluation Metrics for Multiclass Classification",
    height=350,
    width=850,
    legend=dict(title="Model")
)

fig.show()

## Evaluation by Class, Language, Dataset, and Column Name

In [6]:
# Lower-cased text before the first underscore of every distinct semantic class
# in the CASSED DeSSI-MF results (duplicates are kept).
CLASS_NAMES = [
    semantic_class.lower().split("_")[0]
    for semantic_class in results_df_all_data_cassed["semantic_classes"].unique()
]

def is_valid_word(word):
    """Tell whether ``word``'s lower-cased prefix (up to the first "_") is in CLASS_NAMES."""
    prefix, _, _ = word.lower().partition("_")
    return prefix in CLASS_NAMES

_LANGUAGE_BY_SUFFIX = {
    "_fr": "french",
    "_it": "italian",
    "_zh": "chinese",
    "_de": "german",
    "_en": "english",
}


def _base_class(semantic_class):
    """Return the semantic class with its language suffix cut off.

    Labels containing "mixed", "de_DE" or "fr_FR" lose their last 6 characters;
    labels containing one of the two-letter language tags lose their last 3;
    comma-separated labels are returned with their parts sorted.
    """
    # NOTE(review): the tags are searched anywhere in the label but always
    # stripped from the end -- this assumes they only ever occur as suffixes.
    if "mixed" in semantic_class or "de_DE" in semantic_class or "fr_FR" in semantic_class:
        return semantic_class[:-6]
    if any(tag in semantic_class for tag in ("_en", "_de", "_fr", "_it", "_zh")):
        return semantic_class[:-3]
    if "," in semantic_class:
        return ",".join(sorted(semantic_class.split(",")))
    return semantic_class


def _language_of(semantic_class):
    """Return the language name encoded in the label, or "dessi data" if none is."""
    if "mixed" in semantic_class:
        return "mixed language"
    return _LANGUAGE_BY_SUFFIX.get(semantic_class[-3:].lower(), "dessi data")


def _accuracy_by(results_df, column):
    """Exact-match accuracy per distinct value of ``column``, best first."""
    scores = {}
    for value in results_df[column].unique():
        group = results_df.loc[results_df[column] == value]
        scores[value] = accuracy_score(group["True Label"], group["Prediction"])
    return dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))


def analyse_results(results_gpt, results_cassed, dataset="DeSSI-MF", width=1480, height=740, subplot_widths=[0.5,0.3,0.2], ticks=[0, 0.2, 0.4, 0.6, 0.8, 1.0]):
    """Plot per-class and per-property accuracy for GPT-4o and CASSED.

    Two figures are shown:

    * Figure 1 (two rows, one per model): stacked bars per semantic class.
      Each segment is the share of *all* columns of that class which were
      predicted correctly and belong to a given language / column-name type,
      so a full stack equals the class accuracy.
    * Figure 2 (three panels): accuracy by language, by original dataset, and
      by whether the column name is meaningful (see ``is_valid_word``).

    Args:
        results_gpt: GPT-4o result table.
        results_cassed: CASSED result table. Both tables need the columns
            "True Label", "Prediction", "semantic_classes" and "Dataset", and
            the column name in their first column. Neither table is modified.
        dataset: Name used in the figure titles; "Test Languages" switches the
            language list of figure 1 to italian/chinese.
        width: Width of figure 1 in pixels.
        height: Height of figure 1 in pixels.
        subplot_widths: Relative widths of the three panels of figure 2.
        ticks: Tick values for the y axes of figure 1.

    Returns:
        None. Both figures are displayed via ``fig.show()``.
    """
    # One colour per (language, column-name type) pair, laid out as
    # high/low-saturation couples: index 2*language + (0 valid | 1 invalid).
    colors = [
        "#d62728",  # Red (High Saturation)
        "#ff9896",  # Red (Low Saturation)
        "#1f77b4",  # Blue (High Saturation)
        "#aec7e8",  # Blue (Low Saturation)
        "#ff7f0e",  # Orange (High Saturation)
        "#ffbb78",  # Orange (Low Saturation)
        "#2ca02c",  # Green (High Saturation)
        "#98df8a",  # Green (Low Saturation)
        "#9467bd",  # Purple (High Saturation)
        "#c5b0d5",  # Purple (Low Saturation)
    ]
    languages = ['english', 'french', 'german', 'mixed language', 'dessi data'] if dataset != "Test Languages" else ["italian", "chinese"]

    fig1 = make_subplots(rows=2, cols=1, subplot_titles=["GPT-4o Accuracy by Class", "CASSED Accuracy by Class"], vertical_spacing=0.075,
                    shared_xaxes=True, y_title="Accuracy")

    fig2 = make_subplots(
        rows=1, cols=3, shared_yaxes=True, column_widths=subplot_widths,
        subplot_titles=["Language", "Original Dataset", "Type of<br>Column Name"]
    )

    for enu, (results_df, col, name) in enumerate(
        zip([results_gpt.copy(), results_cassed.copy()], ['#636efa', '#EF553B'], ["GPT-4o", "CASSED"])):
        # Derived columns are appended to the copy, so existing positions stay put.
        semantic_classes = results_df["semantic_classes"]
        results_df["Classes_new"] = [_base_class(label) for label in semantic_classes]
        results_df["Language"] = [_language_of(label) for label in semantic_classes]
        # The column name sits in position 0 but is called "Name" in the GPT
        # table and "Column" in the CASSED table, hence the positional access.
        results_df["ValidName"] = np.array(
            [is_valid_word(column_name) for column_name in results_df.iloc[:, 0]], dtype=bool
        )

        accuracies_lan = _accuracy_by(results_df, "Language")
        accuracies_dat = _accuracy_by(results_df, "Dataset")

        valid_rows = results_df.loc[results_df["ValidName"]]
        invalid_rows = results_df.loc[~results_df["ValidName"]]
        accuracy_valid = accuracy_score(valid_rows["True Label"], valid_rows["Prediction"])
        accuracy_invalid = accuracy_score(invalid_rows["True Label"], invalid_rows["Prediction"])

        # ---- Figure 1: stacked per-class bars --------------------------------
        class_totals = results_df["Classes_new"].value_counts()
        for e, language in enumerate(languages):
            lan_df = results_df.loc[results_df["Language"] == language]
            classes = list(lan_df["Classes_new"].unique())
            for ee, col_valid in enumerate([True, False]):
                heights = []
                for class_name in classes:
                    subset = lan_df.loc[
                        (lan_df["Classes_new"] == class_name) & (lan_df["ValidName"] == col_valid)
                    ]
                    correct = (subset["True Label"] == subset["Prediction"]).sum()
                    # accuracy-within-subset * subset-share-of-class == correct / class total
                    heights.append(correct / class_totals.loc[class_name])
                fig1.add_trace(go.Bar(
                    x=classes,
                    y=heights,
                    marker=dict(color=colors[2*e+ee]),
                    name=f"{language}_" + ("valid" if col_valid else "invalid") + "<br>column name",
                    showlegend=(enu == 0),  # legend entries only once, from the first model
                    legendgroup=str(2*e+ee)
                ), row=enu+1, col=1)

        # ---- Figure 2: accuracy by language / dataset / column-name type -----
        fig2.add_trace(go.Bar(
            x=list(accuracies_lan.keys()),
            y=list(accuracies_lan.values()),
            name=name,
            showlegend=True,
            legendgroup=str(enu),
            marker=dict(color=col)
        ), row=1, col=1)
        fig2.add_trace(go.Bar(
            x=list(accuracies_dat.keys()),
            y=list(accuracies_dat.values()),
            name=name,
            showlegend=False,
            legendgroup=str(enu),
            marker=dict(color=col)
        ), row=1, col=2)
        # Values are paired with their labels explicitly. Sorting them by
        # accuracy first (as the original did) swapped the two bars' labels
        # whenever the invalid-name accuracy was the higher one.
        fig2.add_trace(go.Bar(
            x=["meaningful", "random generated"],
            y=[accuracy_valid, accuracy_invalid],
            name=name,
            showlegend=False,
            legendgroup=str(enu),
            marker=dict(color=col)
        ), row=1, col=3)

    # fig1 has two rows, hence exactly two y axes to configure.
    fig1.update_layout(title=f"Accuracy by Semantic Class for Personal Predictions on {dataset}", width=width, height=height, barmode="stack", legend=dict(title="Properties of Column"),
                       yaxis=dict(tickvals=ticks), yaxis2=dict(tickvals=ticks))
    fig1.show()
    fig2.update_layout(width=850, height=350, title=f"Accuracy of Predictions by Language, Dataset, and Column Type on {dataset}", yaxis=dict(title="Accuracy"),
                       legend=dict(title="Model"))
    fig2.show()

In [7]:
# DeSSI-MF: GPT-4o vs. CASSED, using the default figure size and panel widths.
analyse_results(
    results_gpt=results_own_dataset_multiclass_gpt,
    results_cassed=results_df_all_data_cassed,
    dataset="DeSSI-MF",
)

In [8]:
# Test-languages set: smaller figure, three equally wide panels, quarter-step ticks.
analyse_results(
    results_gpt=results_test_lan_multiclass_gpt,
    results_cassed=results_df_two_lan_cassed,
    dataset="Test Languages",
    width=900,
    height=500,
    subplot_widths=[1/3,1/3,1/3],
    ticks=[0, 0.25, 0.5, 0.75, 1.0],
)