# Este cuaderno computa métricas en función de un corte óptimo de probabilidad para cada  modelo

In [2]:
## Librerias necesarias
import pycaret
from pycaret.classification import *
from pycaret.classification import ClassificationExperiment
import tensorflow as tf

import os
import socket
import joblib
import warnings
import numpy as np
import pandas as pd
from scipy.stats import sem
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

## Configuración de rutas
if socket.gethostname()=='SRVCBECO01':
    os.chdir(r'C:\Users\juanpablo.martinez\Downloads\ryr_exes')
elif socket.gethostname() == 'HPJP2': 
    os.chdir(r'C:\Users\juanm\Dropbox\Vías clinicas diabetes')
elif socket.gethostname() == 'CNF106054': 
    os.chdir(r'C:\Users\paul.rodriguez\Dropbox\Salud Colombia\Diabetes Sanitas\Vías clinicas diabetes')
elif socket.gethostname() == 'CNF77701': 
    os.chdir(r'C:\Users\juanpablo.martinez\Dropbox\Vías clinicas diabetes')

    
raw_path = ''
created_path = ""
temp_path = ''
dbs_path = ""
models_path = temp_path+"ryr_results\\baseline_replicate\\models\\"


print('Directorio actual: '+ os.getcwd())

Directorio actual: C:\Users\juanpablo.martinez\Downloads\ryr_exes


### Server packages versions

In [3]:
! pip list

Package                            Version
---------------------------------- --------------------
absl-py                            1.2.0
alabaster                          0.7.12
alembic                            1.11.2
anaconda-client                    1.9.0
anaconda-navigator                 2.1.2
anaconda-project                   0.10.1
ansi2html                          1.8.0
anyio                              2.2.0
appdirs                            1.4.4
argh                               0.26.2
argon2-cffi                        20.1.0
arrow                              0.13.1
asn1crypto                         1.4.0
astroid                            2.6.6
astropy                            4.3.1
astunparse                         1.6.3
async-generator                    1.10
atomicwrites                       1.4.0
attrs                              21.2.0
autopep8                           1.5.7
Babel                              2.9.1
backcall                          



## Funciones - Otros modelos

#### Identifcando variables categóricas y numéricas en la base de datos

In [4]:
def cat_num_identify(ml_data, rename_df):
    categorical_vars_master = ['femenino', 'ERC_high', 'ERC1', 'ERC2', 'adhiere_guia', 'no_adhiere', 'adhiere_colesterol', 'analgesicos',
                                   'antiacidos', 'antihipertensivos', 'hipoglicemiantes', 'hipolipemiantes', 'nutrition_tag_max', 
                                   'exercise_tag_max', 'alcohol_tag_max', 'tobaco_tag_max', 'creatinina_missing']

    categorical_vars = list(set(ml_data.columns).intersection(categorical_vars_master))

    numerical_vars = set(ml_data.columns).difference(set(categorical_vars+['KeyAnonimo', 'year', 'base_label', 'comorbilidades', 'fuera_metas', 'tgt_label']))
    numerical_vars = list(numerical_vars)

    types_df = pd.DataFrame({'old_name' : categorical_vars+numerical_vars,
                             'type' : ['categorical']*len(categorical_vars)+['numerical']*len(numerical_vars)})

    types_df = types_df.merge(rename_df, on = 'old_name', how = 'inner')

    new_cat_vars = list(types_df.loc[types_df['type'] == 'categorical', 'new_name'].values)
    new_num_vars = list(types_df.loc[types_df['type'] == 'numerical', 'new_name'].values)
    
    return new_cat_vars, new_num_vars

#### Partición entre variables exógenas y endógenas

In [5]:
def endog_exog_select(data_mat, dep_var):
    X = data_mat.drop(dep_var, axis = 1)
    Y = data_mat.loc[:, dep_var]
    return(X, Y)

#### Preparación de las bases de entrenamiento y validación

In [6]:
## Función para cargar y eliminar variables no deseadas
def data_clean(df_path, dep_var):
    
    ## Listado de variables a eliminar 
    vars_to_drop = ['KeyAnonimo', 'year', 'base_label', 'tgt_label']
    if dep_var == "comorbilidades":
        vars_to_drop = vars_to_drop+['fuera_metas']
    elif dep_var == "fuera_metas":
        vars_to_drop = vars_to_drop+['comorbilidades']
    
    ## Cargando la base de datos
    data_mat = pd.read_csv(df_path)
    
    ## Eliminando variables innecesarias
    data_mat.drop(vars_to_drop, axis = 1, inplace = True)
    
    return(data_mat.reset_index(drop = True))

def data_prepare_wrapper(tgt_year, base_label, dep_var, model_type):
    ## Construyendo los paths de entrenamiento y validación
    train_path = dbs_path+"train\\train_db_{}_years_base_{}.csv".format(tgt_year, base_label)
    vali_path = dbs_path+"vali\\vali_db_{}_years_base_{}.csv".format(tgt_year, base_label)

    ## Diccionario para renombrar variables
    rename_dict ={'femenino' : 'Female', 'edad' : 'Age (Years)', 'peso': 'Weight (Kg)', 'talla' : 'Height (m)', 
                  'imc' : "BMI (kg/m*m)", 'Colesterol_LDL' : 'LDL Chol. (mg/dL)', 'TFG' : "eGFR (mg/g)", 
                  "ta_diastolica" : "Diast. B.P. (mmHg)", 'ta_sistolica' : 'Sist. B.P. (mmHg)', 'adhiere_guia' : "Hba1c guide Adh.", 
                  "no_adhiere" : "Pharma. Adh.", "analgesicos" : "Analgesics", "antiacidos" : "Antacdis",
                  "antihipertensivos" : "Antihypertensive", "hipoglicemiantes" : "Hypoglecimic agents", 
                  "hipolipemiantes" : "Lipid-lowering agents", "nutrition_tag_max" : "Nutrition recomm.", 
                  "exercise_tag_max" : "Physical act. recomm.", "alcohol_tag_max" : "Alcohol recomm.",
                  "tobaco_tag_max" : "Tobacco recomm.", "creatinina" : "Creatinine (mg/dL)", 
                  "adhiere_colesterol" : "Chol. Adh."} 

    rename_df = pd.DataFrame(rename_dict, index = rename_dict.keys())
    rename_df = pd.DataFrame({'old_name' : rename_df.index, 'new_name' : np.diag(rename_df)})
    
    ## Preparando los datos
    train = data_clean(df_path = train_path, dep_var = dep_var)
    vali = data_clean(df_path = vali_path, dep_var = dep_var)

    ## Identificando variables categóricas y numéricas presentes
    cat_vars, num_vars = cat_num_identify(ml_data = train, rename_df = rename_df)

    ## Renombrando las variables para el gráfico
    train.rename(rename_dict, axis = 1, inplace = True)
    vali.rename(rename_dict, axis = 1, inplace = True)
    
    if model_type != "nn":
        return train, vali, cat_vars, num_vars
    
    elif model_type == "nn":
        ## Segmentación entre variables endógenas y exógenas 
        X_train, Y_train = endog_exog_select(data_mat = train,
                                                 dep_var = 'fuera_metas')
                                                 #dep_var = 'comorbilidades')
        X_vali, Y_vali = endog_exog_select(data_mat = vali, 
                                           dep_var = 'fuera_metas')
                                        #   dep_var = 'comorbilidades')

        return X_train, Y_train, X_vali, Y_vali, cat_vars, num_vars

## Cómputo de las probabilidades Pycaret

In [6]:
def pycaret_prob_compute(dep_var, tgt_year, base_label):
    ## Construyendo el nombre del modelo
    model_name = "{}_{}_years_base_{}".format(dep_var, tgt_year, base_label)

    ## Load the the trained model
    trained_model = load_model(models_path+"{}".format(model_name))
    model_type = str(type(trained_model[6])).split(".")[-1:][0].split("'")[0]

    ## Organize the data for predicting on both training and validation
    train, vali, cat_vars, num_vars = data_prepare_wrapper(tgt_year = tgt_year, 
                                                          base_label = base_label,
                                                          dep_var = dep_var,
                                                          model_type = model_type)

    ## Pycaret setup
    s = setup(data = train,
              target = dep_var,
              test_data = vali,
              session_id = 123,
              index = False,
              numeric_features = num_vars,
              categorical_features = cat_vars,
              remove_multicollinearity = True,
              fix_imbalance = False,
               #feature_selection = False,
              feature_selection = True,
              feature_selection_method = "sequential",
              n_features_to_select = 0.5,
              use_gpu = False, 
             verbose = 0)

    ## Compute training and validation probabilities
    train_prs = predict_model(trained_model[6], 
                            data = train,
                            raw_score = True,
                            verbose = 0)["prediction_score_1"]

    vali_prs = predict_model(trained_model[6], 
                             data = vali, 
                             raw_score = True)['prediction_score_1']

    ## Generate a dataframe with predicted probabilities and labels for the specific model
    pred_res = pd.concat([pd.DataFrame({"sample": np.repeat("training", len(train_prs)), 
                             "probabilities" : train_prs, 
                             "labels" : train[dep_var]}), 
                           pd.DataFrame({"sample": np.repeat("validation", len(vali_prs)), 
                                         "probabilities" : vali_prs, 
                                         "labels" : vali[dep_var]})], 
                          axis = 0)

    pred_res['dep_var'] = dep_var
    pred_res['base_label'] = base_label
    pred_res['tgt_year'] = tgt_year
    pred_res['model_type'] = model_type

    # Export the probabilities
    pred_res.to_csv(temp_path+"ryr_results\\baseline_replicate\\predicted_probabilities\\{}.csv".format(model_name), 
                    sep = ';', 
                    index = False)

### Iterando entre modelos para generar las tablas de probabilidades faltantes

In [7]:
## Iterate across the specifications
for dep_var in ["comorbilidades", "fuera_metas"]:
    for tgt_year in [1, 2]:
        
        # Escogiendo el iterador de estadio base dependiendo de la variable a graficar
        if dep_var == "comorbilidades":
            base_iterator = [1, 2]
        elif dep_var == "fuera_metas":
            base_iterator = [2,4]
            
        for base_label in base_iterator:
            pycaret_prob_compute(dep_var = dep_var, tgt_year = tgt_year, base_label = base_label)

Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ada Boost Classifier,0.6981,0.6187,0.933,0.7019,0.8011,0.2239,0.2684


Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.6963,0.5972,0.9179,0.7106,0.8011,0.2014,0.2343


Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.9034,0.5044,1.0,0.9034,0.9492,0.0,0.0


Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.9086,0.5321,1.0,0.9086,0.9521,0.0,0.0


Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7216,0.5499,1.0,0.7216,0.8383,0.0,0.0


Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7355,0.5258,1.0,0.7355,0.8476,0.0,0.0


Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.6407,0.5178,0.9349,0.6627,0.7756,-0.0078,-0.0121


Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.668,0.5067,0.9985,0.6687,0.801,-0.0021,-0.0226


### Iterando entre las tablas de probabilidades para computar métrics

In [8]:
all_stats = []

## Iterate across models
for dep_var in ["comorbilidades", "fuera_metas"]:
    for tgt_year in [1, 2]:
        # Escogiendo el iterador de estadio base dependiendo de la variable a graficar
        if dep_var == "comorbilidades":
            base_iterator = [1, 2]
        elif dep_var == "fuera_metas":
            base_iterator = [1, 2, 3, 4]
            
        for base_label in base_iterator:
    
            ## Construyendo el nombre del modelo y cargando las probabilidades y etiquetas
            file = "{}_{}_years_base_{}.csv".format(dep_var, tgt_year, base_label)
            model_prs = pd.read_csv(temp_path+"ryr_results\\baseline_replicate\\predicted_probabilities\\"+file,
                                    sep = ';')
            # Isolate the model type, training and vali probabilities
            model_type = model_prs.loc[0, "model_type"]

            train_prs = model_prs.loc[model_prs['sample'] == "training", "probabilities"]
            vali_prs = model_prs.loc[model_prs['sample'] == "validation", "probabilities"]

            train_labs = model_prs.loc[model_prs['sample'] == "training", "labels"]
            vali_labs = model_prs.loc[model_prs['sample'] == "validation", "labels"]

            # Iterate across probability thresholds
            stats_res = []
            for p_val in np.arange(0, 1, 0.001):

                # Generate the labels for the specific probability threshold
                predicted_labels = np.where(train_prs>p_val, 1, 0)

                # Isolate the confusion matrix entries
                tn, fp, fn, tp = confusion_matrix(y_true = train_labs, 
                                                  y_pred =  predicted_labels).ravel()

                # Gather the results in a specific dataframe
                train_res = pd.DataFrame({'p_val': p_val, 
                                          "sample" : ['training'], 
                                          "tp" : [tp], "tn" : [tn], "fp" : [fp], "fn" : [fn], 
                                          "f1" : [2*tp/(2*tp+fp+fn)], 
                                          "accuracy" : [(tp + tn)/(tp+tn+fp+fn)], 
                                          "kappa" : [2*(tp*tn-fn*fp)/((tp+fp)*(fp+tn)+(tp+fn)*(fn+tn))], 
                                          "auc" : [roc_auc_score(train_labs, train_prs)], 
                                          "sens" : [tp/(tp+fn)], 
                                          "spec" : [tn/tn+fp]})

                # Repeat the process for validation
                predicted_labels = np.where(vali_prs>p_val, 1, 0)
                tn, fp, fn, tp = confusion_matrix(y_true = vali_labs, 
                                                  y_pred =  predicted_labels).ravel()
                vali_res = pd.DataFrame({'p_val': p_val, 
                                          "sample" : ['validation'], 
                                          "tp" : [tp], "tn" : [tn], "fp" : [fp], "fn" : [fn], 
                                          "f1" : [2*tp/(2*tp+fp+fn)], 
                                          "accuracy" : [(tp + tn)/(tp+tn+fp+fn)], 
                                          "kappa" : [2*(tp*tn-fn*fp)/((tp+fp)*(fp+tn)+(tp+fn)*(fn+tn))], 
                                          "auc" : [roc_auc_score(vali_labs, vali_prs)], 
                                          "sens" : [tp/(tp+fn)], 
                                          "spec" : [tn/tn+fp]})

                stats_res.append(pd.concat([train_res, vali_res], axis = 0))

            stats_res = pd.concat(stats_res, axis = 0).reset_index(drop = True)
            stats_res['dep_var'] = dep_var
            stats_res['base_label'] = base_label
            stats_res['tgt_year'] = tgt_year

            # Generate maximum kappa indicator and minimum difference dummy
            stats_res.loc[stats_res.groupby("sample").idxmax().reset_index().loc[:, 'kappa'], "max_kappa"] = 1
            stats_res['sens_spec_diff'] = stats_res['sens']-stats_res['spec']
            stats_res.loc[stats_res.groupby("sample").idxmin().reset_index().loc[:, 'sens_spec_diff'], "min_diff"] = 1

            stats_res.loc[:, "Model"] = model_type

            all_stats.append(stats_res)

## Reorganizando la tabla como en el paper

In [18]:
# Append all the statistics
all_stats = pd.concat(all_stats, axis = 0).reset_index(drop = True)

# Export all the statistics
all_stats.to_csv(temp_path+"ryr_results\\baseline_replicate\\predicted_probabilities\\{}.csv".format("stats_res"), 
                 sep = ';', 
                 index = False)

TypeError: first argument must be an iterable of pandas objects, you passed an object of type "DataFrame"

In [19]:
ordered_results = pd.read_csv(temp_path+"ryr_results\\baseline_replicate\\predicted_probabilities\\{}.csv".format("stats_res"), 
                        sep = ';')

# Keep optimal model and naive model, reorder columns
ordered_results = ordered_results.loc[(ordered_results['p_val'] == 0.5) | (ordered_results['max_kappa'] == 1) | (ordered_results['min_diff'] == 1), 
                                      ["p_val", "max_kappa", "min_diff", "tgt_year", "base_label", "dep_var", 'Model', "sample", "accuracy", "f1", "auc"]]

# Change the variables' labels
ordered_results["tgt_year"].replace(to_replace = [1, 2], 
                                      value = ["1-year", 
                                               "2-year"], 
                                        inplace = True)

ordered_results["base_label"].replace(to_replace = [1, 2, 3, 4], 
                                      value = ["1 On-goals", 
                                                 "2 Out-goals", 
                                                 "3 Not-complications", 
                                                 "4 Yes-complications"], 
                                        inplace = True)

ordered_results["dep_var"].replace(to_replace = ["fuera_metas", "comorbilidades"], 
                                      value = ["Out-goals", "Yes-complications"], 
                                        inplace = True)

ordered_results["sample"].replace(to_replace = ["training", "validation"], 
                                      value = ["Training", "Validation"],
                                        inplace = True)

ordered_results.replace(to_replace = ['LogisticRegression', 'AdaBoostClassifier', 'GradientBoostingClassifier', 'LGBMClassifier'], 
                        value = ["LR", "AdaBoost", "GBC", "LGBoost"], 
                        inplace = True)

# Export long version of the results
ordered_results.to_csv(temp_path + "kappa_opt_long_res.csv", sep = ';', index = False)

In [20]:
# Renaming different columns
rename_dict = {"tgt_year" : "Horizon", 
               "base_label" : "Initial condition",
               "dep_var" : "Prediction", 
               "accuracy" : "Accuracy",
               "f1" : "F1-score", 
               "auc" : "AUC"}

## Generate results for naive model
subsample = pd.pivot(
    ordered_results[ordered_results['p_val'] == 0.5].rename(rename_dict, axis = 1), 
    index = ["p_val", 'Horizon', 'Initial condition',  'Prediction', 'Model'], 
    
    columns = "sample", 
    values = ['Accuracy', 
               'F1-score', 
               'AUC']
).reset_index()

# Rows sorting
subsample = pd.concat([subsample[(subsample['Prediction'] == "Out-goals") & (subsample['Horizon'] == "1-year")], 
           subsample[(subsample['Prediction'] == "Yes-complications") & (subsample['Horizon'] == "1-year")],
           subsample[(subsample['Prediction'] == "Out-goals") & (subsample['Horizon'] == "2-year")], 
           subsample[(subsample['Prediction'] == "Yes-complications") & (subsample['Horizon'] == "2-year")]])

# Final cleaning
subsample['Initial condition'] = subsample['Initial condition'].str.replace('\d+', '')

## Reorganise the columns
#subsample.loc[:, "index"] = [2, 4, 5, 6, 9, 10, 11, 12]
print("Results under naive cutoff:")
paper_table = subsample.loc[:, [#('index',''), 
                  ('Horizon',''),
                    ('Initial condition',''),
                    ('Prediction',''),
                    ('Model',''),
                    ("Accuracy", "Training"), 
                    ("F1-score", "Training"), 
                    ("AUC", "Training"),
                    ("Accuracy", "Validation"), 
                    ("F1-score", "Validation"), 
                    ("AUC", "Validation")]].round(4)


paper_table.to_csv(temp_path+"ryr_results\\baseline_replicate\\predicted_probabilities\\naive_metrics.csv", 
                   sep = ';', 
                   index = False)

paper_table

Results under naive cutoff:


Unnamed: 0_level_0,Horizon,Initial condition,Prediction,Model,Accuracy,F1-score,AUC,Accuracy,F1-score,AUC
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Training,Training,Training,Validation,Validation,Validation
0,1-year,On-goals,Out-goals,NN,0.8758,0.4944,0.9185,0.799,0.0906,0.5654
2,1-year,Out-goals,Out-goals,LR,0.7223,0.8388,0.5457,0.7216,0.8383,0.5499
4,1-year,Not-complications,Out-goals,NN,0.8334,0.2481,0.8028,0.7977,0.0826,0.5745
5,1-year,Yes-complications,Out-goals,LR,0.7358,0.8478,0.5545,0.7355,0.8476,0.5258
1,1-year,On-goals,Yes-complications,AdaBoost,0.7252,0.8183,0.6842,0.6989,0.8015,0.6187
3,1-year,Out-goals,Yes-complications,GBC,0.736,0.8283,0.7485,0.6963,0.8011,0.5972
6,2-year,On-goals,Out-goals,NN,0.6754,0.5203,0.7854,0.582,0.382,0.61
8,2-year,Out-goals,Out-goals,LGBoost,0.6838,0.803,0.6536,0.6407,0.7756,0.5179
10,2-year,Not-complications,Out-goals,NN,0.9202,0.7694,0.9712,0.7457,0.1555,0.5311
11,2-year,Yes-complications,Out-goals,LR,0.67,0.8023,0.5664,0.668,0.801,0.5068


In [21]:
# Renaming different columns
rename_dict = {"tgt_year" : "Horizon", 
               "base_label" : "Initial condition",
               "dep_var" : "Prediction", 
               "accuracy" : "Accuracy",
               "f1" : "F1-score", 
               "auc" : "AUC"}

## Generate results for naive model
subsample = pd.pivot(
    ordered_results[ordered_results['max_kappa'] == 1].rename(rename_dict, axis = 1), 
    index = ["max_kappa", 'Horizon', 'Initial condition',  'Prediction', 'Model'], 
    columns = "sample", 
    values = ['Accuracy', 
               'F1-score', 
               'AUC']
).reset_index()

# Rows sorting
subsample = pd.concat([subsample[(subsample['Prediction'] == "Out-goals") & (subsample['Horizon'] == "1-year")], 
           subsample[(subsample['Prediction'] == "Yes-complications") & (subsample['Horizon'] == "1-year")],
           subsample[(subsample['Prediction'] == "Out-goals") & (subsample['Horizon'] == "2-year")], 
           subsample[(subsample['Prediction'] == "Yes-complications") & (subsample['Horizon'] == "2-year")]])

# Final cleaning
subsample['Initial condition'] = subsample['Initial condition'].str.replace('\d+', '')

## Reorganise the columns
#subsample.loc[:, "index"] = [2, 4, 5, 6, 9, 10, 11, 12]
print("Results under optimal Kappa:")
paper_table = subsample.loc[:, [#('index',''), 
                  ('Horizon',''),
                    ('Initial condition',''),
                    ('Prediction',''),
                    ('Model',''),
                    ("Accuracy", "Training"), 
                    ("F1-score", "Training"), 
                    ("AUC", "Training"),
                    ("Accuracy", "Validation"), 
                    ("F1-score", "Validation"), 
                    ("AUC", "Validation")]].round(4)

paper_table.to_csv(temp_path+"ryr_results\\baseline_replicate\\predicted_probabilities\\optimal_kappa_metrics.csv", 
                   sep = ';', 
                   index = False)

paper_table

Results under optimal Kappa:


Unnamed: 0_level_0,Horizon,Initial condition,Prediction,Model,Accuracy,F1-score,AUC,Accuracy,F1-score,AUC
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Training,Training,Training,Validation,Validation,Validation
0,1-year,On-goals,Out-goals,NN,0.8917,0.6862,0.9185,0.5794,0.3066,0.5654
2,1-year,Out-goals,Out-goals,LR,0.605,0.7172,0.5457,0.5224,0.5946,0.5499
4,1-year,Not-complications,Out-goals,NN,0.8227,0.5146,0.8028,0.7454,0.2532,0.5745
5,1-year,Yes-complications,Out-goals,LR,0.5456,0.639,0.5545,0.5959,0.7076,0.5258
1,1-year,On-goals,Yes-complications,AdaBoost,0.7209,0.8101,0.6842,0.6981,0.7992,0.6187
3,1-year,Out-goals,Yes-complications,GBC,0.7353,0.8141,0.7485,0.7037,0.8072,0.5972
6,2-year,On-goals,Out-goals,NN,0.7802,0.5042,0.7854,0.6417,0.375,0.61
8,2-year,Out-goals,Out-goals,LGBoost,0.6255,0.706,0.6536,0.5963,0.7003,0.5179
10,2-year,Not-complications,Out-goals,NN,0.9293,0.8287,0.9712,0.5681,0.3211,0.5311
11,2-year,Yes-complications,Out-goals,LR,0.552,0.6232,0.5664,0.6588,0.7822,0.5068


In [22]:
# Renaming different columns
rename_dict = {"tgt_year" : "Horizon", 
               "base_label" : "Initial condition",
               "dep_var" : "Prediction", 
               "accuracy" : "Accuracy",
               "f1" : "F1-score", 
               "auc" : "AUC"}

## Generate results for naive model
subsample = pd.pivot(
    ordered_results[ordered_results['min_diff'] == 1].rename(rename_dict, axis = 1), 
    index = ["min_diff", 'Horizon', 'Initial condition',  'Prediction', 'Model'], 
    columns = "sample", 
    values = ['Accuracy', 
               'F1-score', 
               'AUC']
).reset_index()

# Rows sorting
subsample = pd.concat([subsample[(subsample['Prediction'] == "Out-goals") & (subsample['Horizon'] == "1-year")], 
           subsample[(subsample['Prediction'] == "Yes-complications") & (subsample['Horizon'] == "1-year")],
           subsample[(subsample['Prediction'] == "Out-goals") & (subsample['Horizon'] == "2-year")], 
           subsample[(subsample['Prediction'] == "Yes-complications") & (subsample['Horizon'] == "2-year")]])

# Final cleaning
subsample['Initial condition'] = subsample['Initial condition'].str.replace('\d+', '')

## Reorganise the columns
#subsample.loc[:, "index"] = [2, 4, 5, 6, 9, 10, 11, 12]
print("Results under minimum Diff:")
paper_table = subsample.loc[:, [#('index',''), 
                  ('Horizon',''),
                    ('Initial condition',''),
                    ('Prediction',''),
                    ('Model',''),
                    ("Accuracy", "Training"), 
                    ("F1-score", "Training"), 
                    ("AUC", "Training"),
                    ("Accuracy", "Validation"), 
                    ("F1-score", "Validation"), 
                    ("AUC", "Validation")]].round(4)

paper_table.to_csv(temp_path+"ryr_results\\baseline_replicate\\predicted_probabilities\\minimum_diff_metrics.csv", 
                   sep = ';', 
                   index = False)

paper_table

Results under minimum Diff:


Unnamed: 0_level_0,Horizon,Initial condition,Prediction,Model,Accuracy,F1-score,AUC,Accuracy,F1-score,AUC
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Training,Training,Training,Validation,Validation,Validation
0,1-year,On-goals,Out-goals,NN,0.1865,0.3032,0.9185,0.1924,0.3044,0.5654
2,1-year,Out-goals,Out-goals,LR,0.7223,0.8387,0.5457,0.7184,0.8359,0.5499
4,1-year,Not-complications,Out-goals,NN,0.1878,0.3161,0.8028,0.188,0.3161,0.5745
5,1-year,Yes-complications,Out-goals,LR,0.7354,0.8475,0.5545,0.7355,0.8475,0.5258
1,1-year,On-goals,Yes-complications,AdaBoost,0.6555,0.7909,0.6842,0.6538,0.7901,0.6187
3,1-year,Out-goals,Yes-complications,GBC,0.6667,0.7999,0.7485,0.6661,0.7992,0.5972
6,2-year,On-goals,Out-goals,NN,0.2282,0.3712,0.7854,0.228,0.3702,0.61
8,2-year,Out-goals,Out-goals,LGBoost,0.6663,0.7992,0.6536,0.658,0.7934,0.5179
10,2-year,Not-complications,Out-goals,NN,0.26,0.3517,0.9712,0.2298,0.3321,0.5311
11,2-year,Yes-complications,Out-goals,LR,0.6665,0.7998,0.5664,0.667,0.7998,0.5068
