In [5]:
import pandas as pd
import json
import numpy as np
from itertools import combinations
from scipy.stats import wilcoxon

In [4]:
models = ["mbert","mbert-nl-clin", "bsc-bio-ehr-es"]
num_folds = 10 
results = []
datasets = ["pharmaconer"]

for model in models:
  for dataset in datasets:
    for fold in range(num_folds):
        with open(f'{model}/{dataset}/result/predict_fold_{fold}_results.json') as f:
            fold_result = json.load(f)
            result = {
              'model': model,
              'dataset': dataset,
              'fold': fold,
              'overall_f1': round(fold_result['predict_overall_f1'], 3),
              'overall_precision': fold_result['predict_overall_precision'],
              'overall_recall': fold_result['predict_overall_recall'],
              'N_precision': fold_result['predict_NORMALIZABLES_f1'],
              'NORMALIZABLES_recall': fold_result['predict_NORMALIZABLES_recall'],
              'NORMALIZABLES_f1': fold_result['predict_NORMALIZABLES_f1'],
              'NORMALIZABLES_precision': fold_result['predict_NORMALIZABLES_precision'],
              'NO_NORMALIZABLES_precision': fold_result['predict_NO_NORMALIZABLES_precision'],
              'NO_NORMALIZABLES_recall': fold_result['predict_NO_NORMALIZABLES_recall'],
              'NO_NORMALIZABLES_f1': fold_result['predict_NO_NORMALIZABLES_f1'],
              'PROTEINAS_precision': fold_result['predict_PROTEINAS_precision'],
              'PROTEINAS_recall': fold_result['predict_PROTEINAS_recall'],
              'PROTEINAS_f1': fold_result['predict_PROTEINAS_f1'],
              'UNCLEAR_precision': fold_result['predict_UNCLEAR_precision'],
              'UNCLEAR_recall': fold_result['predict_UNCLEAR_recall'],
              'UNCLEAR_f1': fold_result['predict_UNCLEAR_f1'],
            }
            results.append(result)
result_df = pd.DataFrame(results)
result_df

Unnamed: 0,model,dataset,fold,overall_f1,overall_precision,overall_recall,N_precision,NORMALIZABLES_recall,NORMALIZABLES_f1,NORMALIZABLES_precision,NO_NORMALIZABLES_precision,NO_NORMALIZABLES_recall,NO_NORMALIZABLES_f1,PROTEINAS_precision,PROTEINAS_recall,PROTEINAS_f1,UNCLEAR_precision,UNCLEAR_recall,UNCLEAR_f1
0,mbert,pharmaconer,0,0.867,0.856612,0.87663,0.904412,0.895942,0.904412,0.913043,0.0,0.0,0.0,0.810479,0.870659,0.839492,0.757576,0.735294,0.746269
1,mbert,pharmaconer,1,0.863,0.85148,0.875543,0.900157,0.895942,0.900157,0.904412,0.083333,0.1,0.090909,0.805339,0.867066,0.835063,0.862069,0.735294,0.793651
2,mbert,pharmaconer,2,0.87,0.868435,0.871739,0.906414,0.896982,0.906414,0.916047,0.0,0.0,0.0,0.820776,0.861078,0.840444,0.793103,0.676471,0.730159
3,mbert,pharmaconer,3,0.862,0.85676,0.867935,0.891693,0.882414,0.891693,0.901169,0.0,0.0,0.0,0.826879,0.869461,0.847636,0.605263,0.676471,0.638889
4,mbert,pharmaconer,4,0.863,0.858836,0.866304,0.894929,0.890739,0.894929,0.89916,0.0,0.0,0.0,0.818078,0.856287,0.836747,0.793103,0.676471,0.730159
5,mbert,pharmaconer,5,0.867,0.861439,0.871739,0.9,0.889698,0.9,0.910543,0.0,0.0,0.0,0.820136,0.868263,0.843514,0.648649,0.705882,0.676056
6,mbert,pharmaconer,6,0.874,0.867702,0.880435,0.907378,0.902185,0.907378,0.912632,1.0,0.1,0.181818,0.820542,0.870659,0.844858,0.833333,0.735294,0.78125
7,mbert,pharmaconer,7,0.87,0.867639,0.872826,0.905956,0.902185,0.905956,0.909759,1.0,0.1,0.181818,0.81954,0.853892,0.836364,0.925926,0.735294,0.819672
8,mbert,pharmaconer,8,0.873,0.867813,0.877717,0.912336,0.904266,0.912336,0.920551,0.0,0.0,0.0,0.818388,0.863473,0.840326,0.757576,0.735294,0.746269
9,mbert,pharmaconer,9,0.867,0.859125,0.875,0.903394,0.900104,0.903394,0.906709,0.166667,0.1,0.125,0.81377,0.863473,0.837885,0.821429,0.676471,0.741935


In [3]:
entity_type = ["NORMALIZABLES", "NO_NORMALIZABLES", "PROTEINAS", "UNCLEAR"]

records = []

for model in result_df["model"].unique():
    for entity in entity_type:
        sub_df = result_df[result_df["model"] == model]

        avg_f1 = sub_df[f"{entity}_f1"].mean()
        std_f1 = sub_df[f"{entity}_f1"].std()
        
        avg_precision = sub_df[f"{entity}_precision"].mean()
        std_precision = sub_df[f"{entity}_precision"].std()
        
        avg_recall = sub_df[f"{entity}_recall"].mean()
        std_recall = sub_df[f"{entity}_recall"].std()

        records.append({
            "model": model,
            "enetity": entity,
            "avg_f1": avg_f1,
            "avg_precision": avg_precision,
            "avg_recall": avg_recall
        })


summary_df = pd.DataFrame(records)
summary_df

Unnamed: 0,model,enetity,avg_f1,avg_precision,avg_recall
0,mbert,NORMALIZABLES,0.902667,0.909402,0.896046
1,mbert,NO_NORMALIZABLES,0.057955,0.225,0.04
2,mbert,PROTEINAS,0.840233,0.817393,0.864431
3,mbert,UNCLEAR,0.740431,0.779803,0.708824
4,mbert-nl-clin,NORMALIZABLES,0.911869,0.922386,0.901665
5,mbert-nl-clin,NO_NORMALIZABLES,0.022857,0.029091,0.02
6,mbert-nl-clin,PROTEINAS,0.849161,0.827235,0.872455
7,mbert-nl-clin,UNCLEAR,0.82204,0.853867,0.794118
8,bsc-bio-ehr-es,NORMALIZABLES,0.933466,0.936374,0.930593
9,bsc-bio-ehr-es,NO_NORMALIZABLES,0.240827,0.197492,0.32


In [5]:
result_df.groupby(['model'], observed = True).agg({'overall_precision':'mean',
                                          'overall_recall':'mean',
                                          'overall_f1':'mean'})

Unnamed: 0_level_0,overall_precision,overall_recall,overall_f1
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bsc-bio-ehr-es,0.881519,0.911304,0.896
mbert,0.861584,0.873587,0.8676
mbert-nl-clin,0.872674,0.88163,0.877


In [11]:
models = result_df["model"].unique()
model_list = models.to_list()
model_names = result_df["model"].unique()

p_matrix = pd.DataFrame(np.ones((len(model_list), len(model_list))), index = model_list, columns = model_list)

for m1, m2 in combinations(model_names, 2):
    f1_1 = result_df[result_df["model"] == m1].sort_values("fold")["overall_f1"].values
    f1_2 = result_df[result_df["model"] == m2].sort_values("fold")["overall_f1"].values

    stat, p = wilcoxon(f1_1, f1_2)
    print(f"{m1} vs {m2} → p-value = {p:.4f}")

    p_matrix.loc[m1, m2] = p
    p_matrix.loc[m2, m1] = p

p_matrix = p_matrix.round(4)
display(p_matrix)

mbert vs mbert-nl-clin → p-value = 0.0020
mbert vs bsc-bio-ehr-es → p-value = 0.0020
mbert-nl-clin vs bsc-bio-ehr-es → p-value = 0.0020


  model_list = models.to_list()


Unnamed: 0,mbert,mbert-nl-clin,bsc-bio-ehr-es
mbert,1.0,0.002,0.002
mbert-nl-clin,0.002,1.0,0.002
bsc-bio-ehr-es,0.002,0.002,1.0


### Cantemist NER dataset

In [6]:
models = ["mbert-nl-clin", "mbert", "bsc_bio_ehr_es"]
num_folds = 10 
results = []
datasets = ["cantemist-ner"]
 
for model in models:
  for dataset in datasets:
    for fold in range(num_folds):
        with open(f'{model}/{dataset}/result/predict_fold_{fold}_results.json') as f:
            fold_result = json.load(f)
            result = {
              'model': model,
              'dataset': dataset,
              'fold': fold,
              'f1': fold_result['predict_f1'],
              'precision': fold_result['predict_precision'],
              'recall': fold_result['predict_recall'],
              'accuracy': fold_result['predict_accuracy']
            }
            results.append(result)
result_df = pd.DataFrame(results)

In [7]:
result_df.groupby(['model'], observed = True).agg({'precision':'mean',
                                          'recall':'mean',
                                          'f1':'mean',
                                          'accuracy':'mean'})

Unnamed: 0_level_0,precision,recall,f1,accuracy
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bsc_bio_ehr_es,0.823026,0.859066,0.840641,0.991683
mbert,0.809244,0.836151,0.822467,0.990922
mbert-nl-clin,0.814421,0.838626,0.826344,0.991105


In [8]:
model_names = result_df["model"].unique()
model_list = model_names.tolist()
# model_names = result_df["model"].unique()

p_matrix = pd.DataFrame(np.ones((len(model_list), len(model_list))), index = model_list, columns = model_list)

for m1, m2 in combinations(model_names, 2):
    f1_1 = result_df[result_df["model"] == m1].sort_values("fold")["f1"].values
    f1_2 = result_df[result_df["model"] == m2].sort_values("fold")["f1"].values

    stat, p = wilcoxon(f1_1, f1_2)
    print(f"{m1} vs {m2} → p-value = {p:.4f}")

    p_matrix.loc[m1, m2] = p
    p_matrix.loc[m2, m1] = p

p_matrix = p_matrix.round(4)
display(p_matrix)

mbert-nl-clin vs mbert → p-value = 0.0137
mbert-nl-clin vs bsc_bio_ehr_es → p-value = 0.0020
mbert vs bsc_bio_ehr_es → p-value = 0.0020


Unnamed: 0,mbert-nl-clin,mbert,bsc_bio_ehr_es
mbert-nl-clin,1.0,0.0137,0.002
mbert,0.0137,1.0,0.002
bsc_bio_ehr_es,0.002,0.002,1.0
