In [1]:
import pandas as pd
import plotly.graph_objects as go
from sklearn.metrics import accuracy_score
import re
from plotly.subplots import make_subplots

In [2]:
# dessi-mf (GPT) test split. Going by the file names, the five tables are: the
# column data, personal labels, class names, multiclass labels and source dataset.
df_llm, labels_llm, classes_llm, multiclass_llm, dataset_llm = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/dessi-mf/dessi-mf_gpt/test.csv',
        '../datasets/dessi-mf/dessi-mf_gpt/test_labels_personal.csv',
        '../datasets/dessi-mf/dessi-mf_gpt/test_classes.csv',
        '../datasets/dessi-mf/dessi-mf_gpt/test_labels_multiclass.csv',
        '../datasets/dessi-mf/dessi-mf_gpt/test_dataset.csv',
    )
)

# Same five tables for the test_languages data, read in the same order.
df_test_lan, labels_test_lan, classes_test_lan, multiclass_test_lan, dataset_test_lan = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/test_languages/test.csv',
        '../datasets/test_languages/test_labels_personal.csv',
        '../datasets/test_languages/test_classes.csv',
        '../datasets/test_languages/test_labels_multiclass.csv',
        '../datasets/test_languages/test_dataset.csv',
    )
)

In [3]:
def load_responses(path, encoding=None):
    """Read a responses file and return its lines without the first one.

    The file is split on "\n" and the first line is discarded
    (presumably a header row -- confirm against the file that is written).

    A file that ends with a newline would otherwise produce one extra empty
    string at the end of the list; that terminator-induced entry is dropped so
    the result has exactly one element per response line. Empty lines in the
    middle of the file are kept.

    Parameters
    ----------
    path : str or path-like
        Location of the text file to read.
    encoding : str, optional
        Text encoding passed to ``open``. ``None`` (the default) keeps the
        platform default.

    Returns
    -------
    list of str
        One string per line after the first line.
    """
    with open(path, "r", encoding=encoding) as f:
        lines = f.read().split("\n")
    # "a\nb\n".split("\n") ends with "", which is not a real record.
    if lines and lines[-1] == "":
        lines.pop()
    return lines[1:]

In [4]:
# GPT response lines for the dessi-mf data and for the test_languages data.
responses_own_dataset, responses_test_lan = (
    load_responses(results_path)
    for results_path in (
        "gpt_predictions/dessi-mf_results.txt",
        "gpt_predictions/test_languages_results.txt",
    )
)

In [5]:
# Finds the `detected_classes: [...]` entry of a response line, with the key
# single-quoted, double-quoted or bare. Group 2 is the raw content of the list.
# Written as a raw string so that `\[` / `\]` reach the regex engine unchanged.
_DETECTED_CLASSES_RE = re.compile(
    r"""('detected_classes'|"detected_classes"|detected_classes): \[(.*?)\]"""
)


def _parse_detected_classes(responses):
    """Extract the normalised `detected_classes` list from every response line.

    For each response the list content is stripped of quote characters, the
    ", " separators are collapsed to ",", and the entries are sorted so that
    the result can be compared to the (equally sorted) true label string.

    Parameters
    ----------
    responses : sequence of str
        Raw response lines.

    Returns
    -------
    list of str
        One comma-joined, alphabetically sorted class string per response;
        an empty list in the response gives "".

    Raises
    ------
    ValueError
        If a response contains no `detected_classes` list.
    """
    parsed = []
    for position, response in enumerate(responses):
        match = _DETECTED_CLASSES_RE.search(response)
        if match is None:
            raise ValueError(
                f"No 'detected_classes' list found in response {position}: {response!r}"
            )
        raw = match.group(2).replace("'", "").replace('"', "").replace(", ", ",")
        # Sorting a single entry is a no-op, so no special case is needed.
        parsed.append(",".join(sorted(raw.split(","))))
    return parsed


def _build_results_frame(column_names, true_labels, predictions, classes, datasets):
    """Assemble the per-column results table used by the analysis below.

    The true labels are comma-split, sorted and re-joined so that label order
    does not matter when they are compared with the predictions.
    """
    return pd.DataFrame({
        "Name": column_names,
        "True Label": [",".join(sorted(label.split(","))) for label in true_labels],
        "Prediction": predictions,
        "Classes": classes,
        "Dataset": datasets,
    })


detected_classes = _parse_detected_classes(responses_own_dataset)
results_own_dataset_multiclass = _build_results_frame(
    df_llm.columns,
    multiclass_llm["label"],
    detected_classes,
    classes_llm["class"],
    dataset_llm["dataset"],
)

detected_classes = _parse_detected_classes(responses_test_lan)
results_test_lan_multiclass = _build_results_frame(
    df_test_lan.columns,
    multiclass_test_lan["label"],
    detected_classes,
    classes_test_lan["class"],
    dataset_test_lan["dataset"],
)

In [6]:
# Every distinct class label of the dessi-mf data, lower-cased and cut off at
# its first underscore.
CLASS_NAMES = [
    class_label.lower().split("_")[0]
    for class_label in classes_llm["class"].unique()
]

def is_valid_word(word):
    """Tell whether the lower-cased part of `word` before its first underscore is in CLASS_NAMES."""
    prefix, _, _ = word.lower().partition("_")
    return prefix in CLASS_NAMES

# Last three characters of a class label (lower-cased) -> language name used in the plots.
_LANGUAGE_BY_SUFFIX = {
    "_fr": "french",
    "_it": "italian",
    "_zh": "chinese",
    "_de": "german",
    "_en": "english",
}

# Markers that cause the last six / last three characters of a class label to be cut.
_LONG_LANGUAGE_TAGS = ("mixed", "de_DE", "fr_FR")
_SHORT_LANGUAGE_TAGS = ("_en", "_de", "_fr", "_it", "_zh")

# One saturated/pale pair per language: saturated = valid column name, pale = invalid.
_BAR_COLORS = [
    "#d62728",  # Red (High Saturation)
    "#ff9896",  # Red (Low Saturation)
    "#1f77b4",  # Blue (High Saturation)
    "#aec7e8",  # Blue (Low Saturation)
    "#ff7f0e",  # Orange (High Saturation)
    "#ffbb78",  # Orange (Low Saturation)
    "#2ca02c",  # Green (High Saturation)
    "#98df8a",  # Green (Low Saturation)
    "#9467bd",  # Purple (High Saturation)
    "#c5b0d5",  # Purple (Low Saturation)
]


def _strip_language_suffix(class_label):
    """Return `class_label` without its language suffix; comma lists are sorted."""
    # NOTE(review): the tags are matched anywhere in the label but the cut is
    # always taken from the end -- confirm no class name contains e.g. "_it"
    # in the middle.
    if any(tag in class_label for tag in _LONG_LANGUAGE_TAGS):
        return class_label[:-6]
    if any(tag in class_label for tag in _SHORT_LANGUAGE_TAGS):
        return class_label[:-3]
    if "," in class_label:
        return ",".join(sorted(class_label.split(",")))
    return class_label


def _language_of(class_label):
    """Return the language name implied by `class_label` ("dessi data" if none)."""
    if "mixed" in class_label:
        return "mixed language"
    return _LANGUAGE_BY_SUFFIX.get(class_label[-3:].lower(), "dessi data")


def _accuracy_by_group(results_df, column):
    """Accuracy of "Prediction" vs "True Label" per distinct value of `column`, best first."""
    scores = {}
    for value in results_df[column].unique():
        group = results_df.loc[results_df[column] == value]
        scores[value] = accuracy_score(group["True Label"], group["Prediction"])
    return dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))


def _add_outcome_traces(fig, results_df, languages, valid_name):
    """Add the stacked per-class bars of correct (row 2) and false (row 3) predictions.

    For every language and for valid / invalid column names, one bar trace is
    added whose height per class is the share of that class' rows (over the
    whole frame) that fall into the language, the name-validity group and the
    outcome of the row.
    """
    is_correct = (results_df["True Label"] == results_df["Prediction"]).to_numpy()
    class_totals = results_df["Classes_new"].value_counts().to_dict()

    for outcome in (True, False):
        for lang_idx, language in enumerate(languages):
            in_language = (results_df["Language"] == language).to_numpy()
            classes = list(results_df.loc[in_language, "Classes_new"].unique())
            for validity_idx, col_valid in enumerate((True, False)):
                selected = in_language & (valid_name == col_valid) & (is_correct == outcome)
                hits = results_df.loc[selected, "Classes_new"].value_counts().to_dict()
                slot = 2 * lang_idx + validity_idx
                fig.add_trace(go.Bar(
                    x=classes,
                    y=[hits.get(name, 0) / class_totals[name] for name in classes],
                    marker=dict(color=_BAR_COLORS[slot]),
                    name=f"{language}_" + ("valid" if col_valid else "invalid") + "<br>column name",
                    # Only the "correct" traces get a legend entry; the matching
                    # "false" trace shares its legend group.
                    showlegend=outcome,
                    legendgroup=str(slot),
                ), row=2 if outcome else 3, col=1)


def create_analysis_plot(results_df, title, dataset="own_data"):
    """Show two figures that break the prediction accuracy down by category.

    Figure 1 (three stacked rows sharing the x axis): accuracy per class, then
    the share of correct and of false predictions per class, split by language
    and by whether the column name is a valid word (see ``is_valid_word``).
    Figure 2: accuracy per language, per dataset and per column-name validity.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Must contain the columns "Name", "True Label", "Prediction", "Classes"
        and "Dataset". It is modified in place: the columns "Classes_new"
        (class without language suffix) and "Language" are added or overwritten.
    title : str
        Text inserted into the titles of both figures.
    dataset : str, optional
        "two_languages" plots the languages italian and chinese; any other
        value plots english, french, german, mixed language and dessi data.

    Returns
    -------
    None
        The figures are displayed with ``fig.show()``.
    """
    # Derived columns; later cells display them, so the frame is updated in place.
    results_df["Classes_new"] = results_df["Classes"].map(_strip_language_suffix)
    results_df["Language"] = results_df["Classes"].map(_language_of)

    accuracies_lan = _accuracy_by_group(results_df, "Language")
    accuracies_cla = _accuracy_by_group(results_df, "Classes_new")
    accuracies_dat = _accuracy_by_group(results_df, "Dataset")

    # Boolean mask instead of index lists: avoids the quadratic "not in" scan.
    valid_name = results_df["Name"].map(is_valid_word).to_numpy(dtype=bool)
    accuracies_col = {
        "valid_col_names": accuracy_score(
            results_df.loc[valid_name, "True Label"], results_df.loc[valid_name, "Prediction"]
        ),
        "invalid_col_names": accuracy_score(
            results_df.loc[~valid_name, "True Label"], results_df.loc[~valid_name, "Prediction"]
        ),
    }
    accuracies_col = dict(sorted(accuracies_col.items(), key=lambda item: item[1], reverse=True))

    fig = make_subplots(rows=3, cols=1, subplot_titles=["Accuracy per Class", "Number of correct predictions per Class", "Number of false predictions per Class"],
                        vertical_spacing=0.065, shared_xaxes=True)
    fig.add_trace(go.Bar(
        x=list(accuracies_cla.keys()),
        y=list(accuracies_cla.values()),
        marker=dict(color="black"),
        showlegend=False
    ), row=1, col=1)

    languages = ['english', 'french', 'german', 'mixed language', 'dessi data'] if dataset != "two_languages" else ["italian", "chinese"]
    _add_outcome_traces(fig, results_df, languages, valid_name)

    fig.update_layout(title=f"Accuracy per Class for GPT's {title}", width=1500, height=700, barmode="stack")
    fig.update_yaxes(title_text="Accuracy", row=1, col=1)
    fig.update_yaxes(title_text="Amount of<br>correct predictions", row=2, col=1)
    fig.update_yaxes(title_text="Amount of<br>false predictions", range=[0,1.1], row=3, col=1)
    fig.show()

    fig = make_subplots(
        rows=1, cols=3, shared_yaxes=True,
        subplot_titles=["Accuracy per Language", "Accuracy per Dataset", "Accuracy per Column Name"]
    )
    for column, accuracies in enumerate((accuracies_lan, accuracies_dat, accuracies_col), start=1):
        fig.add_trace(go.Bar(
            x=list(accuracies.keys()),
            y=list(accuracies.values()),
            showlegend=False
        ), row=1, col=column)
    fig.update_layout(width=900, height=500, title=f"Accuracy of GPT's {title} with respect to different categories")
    fig.show()

# Dessi-mf

In [7]:
# Overall accuracy on the dessi-mf results, followed by the breakdown figures.
print(
    "Overall accuracy of GPT multiclass predictions on dessi-mf: "
    f"{accuracy_score(results_own_dataset_multiclass['True Label'], results_own_dataset_multiclass['Prediction']):.3f}"
)
create_analysis_plot(results_own_dataset_multiclass, title="multiclass predictions on dessi-mf")

Overall accuracy of GPT multiclass predictions on dessi-mf: 0.940


In [8]:
# Rows of the dessi-mf results whose prediction differs from the true label.
results_own_dataset_multiclass[
    results_own_dataset_multiclass["Prediction"].ne(results_own_dataset_multiclass["True Label"])
]

Unnamed: 0,Name,True Label,Prediction,Classes,Dataset,Classes_new,Language
17,dPI1af7W,SWIFT/BIC code,,swift_fr_FR,faker,swift,french
44,name_de_DE_faker,full_name,"academic_degree/title,full_name",name_de_DE,faker,name,german
48,f5GCZdDiGADKdJDRZtUb,version,,version_fr,mimesis,version,french
49,ypqnvPf7C,id_card,,ID_Card,dessi,ID_Card,dessi data
129,R74KPONu3OujCeECOtY,"email,national_identification_number",email,"NIN,Email",dessi,"Email,NIN",dessi data
139,36Ud9sMtwgKGnjQEo2,"email,national_identification_number","email,national_identification_number,passport_...","Email,NIN",dessi,"Email,NIN",dessi data
157,color_de_DE_faker,color,,color_de_DE,faker,color,german
193,X8KMf4djkWNdRfr,"national_identification_number,phone_number",phone_number,"Phone_number,NIN",dessi,"NIN,Phone_number",dessi data
203,color_fr_FR_faker,color,,color_fr_FR,faker,color,french
204,pDwjrm74UWhc,"date,national_identification_number","date,national_identification_number,passport_n...","Date,NIN",dessi,"Date,NIN",dessi data


# Test Languages

In [9]:
# Overall accuracy on the test_languages results, followed by the breakdown
# figures restricted to the two languages of that data.
# NOTE(review): the message and title say "own dataset" although the frame holds
# the test_languages results -- confirm the wording is intended.
print(
    "Overall accuracy of GPT multiclass predictions on own dataset: "
    f"{accuracy_score(results_test_lan_multiclass['True Label'], results_test_lan_multiclass['Prediction']):.3f}"
)
create_analysis_plot(
    results_test_lan_multiclass,
    title="multiclass predictions on own dataset",
    dataset="two_languages",
)

Overall accuracy of GPT multiclass predictions on own dataset: 0.950


In [10]:
# Rows of the test_languages results whose prediction differs from the true label.
results_test_lan_multiclass[
    results_test_lan_multiclass["Prediction"].ne(results_test_lan_multiclass["True Label"])
]

Unnamed: 0,Name,True Label,Prediction,Classes,Dataset,Classes_new,Language
26,GWlG6g3Ot1OGM,nationality,gpe,nationality_zh,mimesis,nationality,chinese
65,XPvhsBkDa9U4U,nationality,race,nationality_it,mimesis,nationality,italian
73,Gqi7w4e4pxs,word,,word_zh,mimesis,word,chinese
84,y3c2Eom0,SWIFT/BIC code,,swift_it,faker,swift,italian
94,wt0Y3oobQmzvr3e9Xrw,SWIFT/BIC code,,swift_zh,faker,swift,chinese
