# Este cuaderno busca el mejor modelo para cada horizonte temporal, estadio base y variable de salida

In [1]:
## Librerias necesarias
import pycaret
from pycaret.classification import *
from pycaret.classification import ClassificationExperiment

import os
import re
import socket
import shutil
import warnings
import numpy as np
import pandas as pd
import plotly.io as pio
import plotly.express as px 
from sklearn.metrics import confusion_matrix

## Path configuration
# Known machines and the project folder each one should work from.
# Unknown hosts keep whatever working directory the kernel started in.
_project_dirs = {
    'SRVCBECO01': r'C:\Users\juanpablo.martinez\Downloads\ryr_exes',
    'HPJP2': r'C:\Users\juanm\Dropbox\JP_files\UR\Vías clinicas diabetes',
    'CNF106054': r'C:\Users\paul.rodriguez\Dropbox\Salud Colombia\Diabetes Sanitas\Vías clinicas diabetes',
    'CNF77701': r'C:\Users\juanpablo.martinez\Dropbox\Vías clinicas diabetes',
}
_host_dir = _project_dirs.get(socket.gethostname())
if _host_dir is not None:
    os.chdir(_host_dir)

# Path prefixes used by the cells below; all empty, so every path is
# resolved relative to the current working directory.
raw_path = created_path = temp_path = dbs_path = ''

print('Directorio actual: '+ os.getcwd())

Directorio actual: C:\Users\juanpablo.martinez\Downloads\ryr_exes


### Server packages versions

In [2]:
! pip list

Package                            Version




---------------------------------- --------------------
absl-py                            1.2.0
alabaster                          0.7.12
alembic                            1.11.2
anaconda-client                    1.9.0
anaconda-navigator                 2.1.2
anaconda-project                   0.10.1
ansi2html                          1.8.0
anyio                              2.2.0
appdirs                            1.4.4
argh                               0.26.2
argon2-cffi                        20.1.0
arrow                              0.13.1
asn1crypto                         1.4.0
astroid                            2.6.6
astropy                            4.3.1
astunparse                         1.6.3
async-generator                    1.10
atomicwrites                       1.4.0
attrs                              21.2.0
autopep8                           1.5.7
Babel                              2.9.1
backcall                           0.2.0
backports.functools-lru-cache      1

## Funciones

### Definición de parámetros básicos

In [3]:
def parameters_gather(tgt_year, base_label, dep_var):
    """Build the training/validation CSV paths and the list of columns to drop.

    Parameters
    ----------
    tgt_year
        Value inserted in the file names as the number of years.
    base_label
        Value inserted in the file names as the base label.
    dep_var
        Outcome to model. When it is "comorbilidades" or "fuera_metas",
        the other one of the two is added to the columns to drop.

    Returns
    -------
    tuple
        ``(train_path, vali_path, vars_to_drop)``. Both paths are prefixed
        with the notebook-level ``dbs_path``.
    """
    # File-name templates for the training and validation sets
    file_templates = ("train\\train_db_{}_years_base_{}.csv",
                      "vali\\vali_db_{}_years_base_{}.csv")
    train_path, vali_path = [dbs_path+template.format(tgt_year, base_label)
                             for template in file_templates]

    # Columns that are always removed before modelling
    vars_to_drop = ['KeyAnonimo', 'year', 'base_label', 'tgt_label']

    # The outcome that is not being modelled is removed as well
    for outcome, other_outcome in (("comorbilidades", "fuera_metas"),
                                   ("fuera_metas", "comorbilidades")):
        if dep_var == outcome:
            vars_to_drop = vars_to_drop+[other_outcome]
            break

    return train_path, vali_path, vars_to_drop

py                                 1.10.0
pyarrow                            12.0.1
pyasn1                             0.4.8
pyasn1-modules                     0.2.8
pybind11                           2.6.1
pycaret                            3.0.4
pycodestyle                        2.7.0
pycosat                            0.6.3
pycparser                          2.20
pyct                               0.4.8
pycurl                             7.44.1
pydantic                           1.8.2
pydocstyle                         6.1.1
pyerfa                             2.0.0
pyflakes                           2.3.1
Pygments                           2.10.0
PyGreSQL                           5.2.4
PyJWT                              2.1.0
pylint                             2.9.6
pyls-spyder                        0.4.0
PyNaCl                             1.4.0
pyod                               1.1.0
pyodbc                             4.0.0-unsupported
pyOpenSSL                          21.0.0


### Preparación de las bases de entrenamiento y validación

In [4]:
def data_prepare(df_path, vars_to_drop):
    """Read a CSV file and return it without the columns in ``vars_to_drop``.

    Parameters
    ----------
    df_path
        Path of the CSV file, passed to ``pd.read_csv``.
    vars_to_drop
        Column labels to remove; a missing label raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        The remaining columns with a fresh 0..n-1 index.
    """
    loaded = pd.read_csv(df_path)
    trimmed = loaded.drop(columns = vars_to_drop)
    return trimmed.reset_index(drop = True)

### Identificando variables categóricas y numéricas en la base de datos

In [5]:
def cat_num_identify(ml_data, rename_df):
    """Split the columns of ``ml_data`` into categorical and numerical features.

    The result is returned under the display names found in ``rename_df``.
    Columns are reported in the order in which they appear in ``ml_data``,
    so the output is deterministic across kernel restarts (the previous
    set-based implementation depended on string hash ordering).

    Parameters
    ----------
    ml_data : pandas.DataFrame
        Modelling table; only its column labels are used.
    rename_df : pandas.DataFrame
        Table with columns ``old_name`` and ``new_name``.

    Returns
    -------
    tuple of list
        ``(new_cat_vars, new_num_vars)``: display names of the categorical
        and numerical features. Columns without a row in ``rename_df`` are
        left out of both lists (inner merge).
    """
    categorical_vars_master = ['femenino', 'ERC_high', 'ERC1', 'ERC2', 'adhiere_guia', 'no_adhiere', 'adhiere_colesterol', 'analgesicos',
                               'antiacidos', 'antihipertensivos', 'hipoglicemiantes', 'hipolipemiantes', 'nutrition_tag_max',
                               'exercise_tag_max', 'alcohol_tag_max', 'tobaco_tag_max', 'creatinina_missing']
    # Identifier and outcome columns are never treated as features
    non_feature_vars = {'KeyAnonimo', 'year', 'base_label', 'comorbilidades', 'fuera_metas', 'tgt_label'}

    master = set(categorical_vars_master)
    # dict.fromkeys removes repeated labels while keeping first-seen order
    columns = list(dict.fromkeys(ml_data.columns))

    categorical_vars = [col for col in columns if col in master]
    numerical_vars = [col for col in columns
                      if col not in master and col not in non_feature_vars]

    types_df = pd.DataFrame({'old_name' : categorical_vars+numerical_vars,
                             'type' : ['categorical']*len(categorical_vars)+['numerical']*len(numerical_vars)})

    # The inner merge keeps the order of the left keys and discards any
    # variable that has no display name in rename_df
    types_df = types_df.merge(rename_df, on = 'old_name', how = 'inner')

    new_cat_vars = types_df.loc[types_df['type'] == 'categorical', 'new_name'].tolist()
    new_num_vars = types_df.loc[types_df['type'] == 'numerical', 'new_name'].tolist()

    return new_cat_vars, new_num_vars

## Parámetros básicos

In [6]:
## Dictionary mapping the raw column names to the labels shown in plots
# NOTE(review): "Hypoglecimic" and "Sist." look like misspellings of
# "Hypoglycemic" and "Syst."; the labels are left untouched here because they
# are emitted at runtime - confirm before changing them.
rename_dict ={'femenino' : 'Female', 'edad' : 'Age (Years)', 'peso': 'Weight (Kg)', 'talla' : 'Height (m)', 
              'imc' : "BMI (kg/m*m)", 'Colesterol_LDL' : 'LDL Chol. (mg/dL)', 'TFG' : "eGFR (mg/g)", 
              "ta_diastolica" : "Diast. B.P. (mmHg)", 'ta_sistolica' : 'Sist. B.P. (mmHg)', 'adhiere_guia' : "Hba1c guide Adh.", 
              "no_adhiere" : "Pharma. Adh.", "analgesicos" : "Analgesics", "antiacidos" : "Antacids",
              "antihipertensivos" : "Antihypertensive", "hipoglicemiantes" : "Hypoglecimic agents", 
              "hipolipemiantes" : "Lipid-lowering agents", "nutrition_tag_max" : "Nutrition recomm.", 
              "exercise_tag_max" : "Physical act. recomm.", "alcohol_tag_max" : "Alcohol recomm.",
              "tobaco_tag_max" : "Tobacco recomm.", "creatinina" : "Creatinine (mg/dL)", 
              "adhiere_colesterol" : "Chol. Adh.", 
             "ERC1" : "CKD-1", "ERC2" : "CKD-2", "ERC3" : "CKD-Late"} 

# Alternative LaTeX label for BMI, currently unused:
#'imc': r"BMI ($\displaystyle\frac{kg}{m^2}$)"}

# Two-column lookup table (old_name -> new_name), built directly from the
# dictionary instead of broadcasting it to an n x n frame and taking its diagonal.
rename_df = pd.DataFrame({'old_name' : list(rename_dict.keys()),
                          'new_name' : list(rename_dict.values())})

## Generating the requested plots and tables - Reviewer 4 

### Feature importance of two models

In [7]:
dep_var = "fuera_metas"
tgt_year = 1
base_label = 2

print("Training horizon: {}, Initial state: {}, Outcome: {}.".format(tgt_year, base_label, dep_var))

# Build the input paths and the list of columns to drop
train_path, vali_path, vars_to_drop = parameters_gather(tgt_year = tgt_year, 
                                                        base_label = base_label,
                                                        dep_var = dep_var)

# Load the training and validation sets
train = data_prepare(df_path = train_path, vars_to_drop = vars_to_drop)
vali = data_prepare(df_path = vali_path, vars_to_drop = vars_to_drop)

# Identify the categorical and numerical variables present (display names)
cat_vars, num_vars = cat_num_identify(ml_data = train, rename_df = rename_df)

# Rename the columns to their display labels for the plot.
# The mapping is rebuilt from rename_df because the name `rename_dict` is
# reassigned by the confusion-matrix cell further down; using it here would
# silently skip the renaming if the cells are run out of order.
feature_labels = dict(zip(rename_df['old_name'], rename_df['new_name']))
train = train.rename(columns = feature_labels)
vali = vali.rename(columns = feature_labels)

# Experiment configuration
s = setup(data = train,
          target = dep_var,
          test_data = vali,
          session_id = 123,
          index = False,
          numeric_features = num_vars,
          categorical_features = cat_vars,
          remove_multicollinearity = True,
          fix_imbalance = False,
          feature_selection = True,
          feature_selection_method = "sequential",
          n_features_to_select = 0.5,
          use_gpu = False, 
         )

# Fit the baseline model (logistic regression only)
best = compare_models(sort = "F1", include = ['lr'])

# Generate the feature importance plot; with save = True the plot is written
# to "Feature Importance.png", which is then stored under a descriptive name
plot_model(best, plot = 'feature', save = True)
importance_png = ".\\"+"Feature Importance.png"
features_png = ".\\"+temp_path+"ryr_results\\baseline_replicate\\predicted_probabilities\\features_{}_years_base_{}_{}.png".format(tgt_year, base_label, dep_var)

# Make sure the destination folder exists before copying
features_dir = os.path.dirname(features_png)
if features_dir:
    os.makedirs(features_dir, exist_ok = True)
shutil.copyfile(importance_png, features_png)
os.remove(importance_png)

Training horizon: 1, Initial state: 2, Outcome: fuera_metas.


Unnamed: 0,Description,Value
0,Session id,123
1,Target,fuera_metas
2,Target type,Binary
3,Original data shape,"(4081, 24)"
4,Transformed data shape,"(4081, 12)"
5,Transformed train set shape,"(2856, 12)"
6,Transformed test set shape,"(1225, 12)"
7,Ordinal features,13
8,Numeric features,9
9,Categorical features,13


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7223,0.5145,1.0,0.7223,0.8388,0.0,0.0,8.278


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

In [8]:
dep_var = "fuera_metas"
tgt_year = 1
base_label = 4

print("Training horizon: {}, Initial state: {}, Outcome: {}.".format(tgt_year, base_label, dep_var))

# Build the input paths and the list of columns to drop
train_path, vali_path, vars_to_drop = parameters_gather(tgt_year = tgt_year, 
                                                        base_label = base_label,
                                                        dep_var = dep_var)

# Load the training and validation sets
train = data_prepare(df_path = train_path, vars_to_drop = vars_to_drop)
vali = data_prepare(df_path = vali_path, vars_to_drop = vars_to_drop)

# Identify the categorical and numerical variables present (display names)
cat_vars, num_vars = cat_num_identify(ml_data = train, rename_df = rename_df)

# Rename the columns to their display labels for the plot.
# The mapping is rebuilt from rename_df because the name `rename_dict` is
# reassigned by the confusion-matrix cell further down; using it here would
# silently skip the renaming if the cells are run out of order.
feature_labels = dict(zip(rename_df['old_name'], rename_df['new_name']))
train = train.rename(columns = feature_labels)
vali = vali.rename(columns = feature_labels)

# Experiment configuration
s = setup(data = train,
          target = dep_var,
          test_data = vali,
          session_id = 123,
          index = False,
          numeric_features = num_vars,
          categorical_features = cat_vars,
          remove_multicollinearity = True,
          fix_imbalance = False,
          feature_selection = True,
          feature_selection_method = "sequential",
          n_features_to_select = 0.5,
          use_gpu = False, 
         )

# Fit the baseline model (logistic regression only)
best = compare_models(sort = "F1", include = ['lr'])

# Generate the feature importance plot; with save = True the plot is written
# to "Feature Importance.png", which is then stored under a descriptive name
plot_model(best, plot = 'feature', save = True)
importance_png = ".\\"+"Feature Importance.png"
features_png = ".\\"+temp_path+"ryr_results\\baseline_replicate\\predicted_probabilities\\features_{}_years_base_{}_{}.png".format(tgt_year, base_label, dep_var)

# Make sure the destination folder exists before copying
features_dir = os.path.dirname(features_png)
if features_dir:
    os.makedirs(features_dir, exist_ok = True)
shutil.copyfile(importance_png, features_png)
os.remove(importance_png)

Training horizon: 1, Initial state: 4, Outcome: fuera_metas.


Unnamed: 0,Description,Value
0,Session id,123
1,Target,fuera_metas
2,Target type,Binary
3,Original data shape,"(7926, 27)"
4,Transformed data shape,"(7926, 14)"
5,Transformed train set shape,"(5548, 14)"
6,Transformed test set shape,"(2378, 14)"
7,Ordinal features,15
8,Numeric features,9
9,Categorical features,15


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7358,0.549,1.0,0.7358,0.8478,0.0,0.0,7.659


Processing:   0%|          | 0/9 [00:00<?, ?it/s]

### Confusion matrix of two models

In [58]:
## Load the per-model statistics table
ordered_results = pd.read_csv(temp_path+"ryr_results\\baseline_replicate\\predicted_probabilities\\{}.csv".format("stats_res"), 
                        sep = ';')

# Keep the rows flagged with max_kappa == 1 and the confusion-matrix counts.
# .copy() makes the selection an independent frame, so the column edits below
# are applied to it for certain (no SettingWithCopy / Copy-on-Write surprises).
confusion_entries = ordered_results.loc[(ordered_results['max_kappa'] == 1), 
                    ["tgt_year", "base_label", "dep_var", "sample", "Model", "tp", "tn", "fp", "fn"]].copy()

# Compute rates (used by the rates table in the following cell)
confusion_entries['TPR'] = confusion_entries['tp']/(confusion_entries['tp']+confusion_entries['fn'])
confusion_entries['FNR'] = 1-confusion_entries['TPR']
confusion_entries['FPR'] = confusion_entries['fp']/(confusion_entries['fp']+confusion_entries['tn'])
confusion_entries['TNR'] = 1-confusion_entries['FPR']


# Change the variables' labels. Each result is assigned back to its column:
# a chained `df[col].replace(..., inplace = True)` does not update the parent
# frame when pandas Copy-on-Write is active.
confusion_entries["tgt_year"] = confusion_entries["tgt_year"].replace(to_replace = [1, 2], 
                                                                      value = ["1-year", 
                                                                               "2-year"])

confusion_entries["base_label"] = confusion_entries["base_label"].replace(to_replace = [1, 2, 3, 4], 
                                                                          value = ["1 On-goals", 
                                                                                   "2 Out-goals", 
                                                                                   "3 Not-complications", 
                                                                                   "4 Yes-complications"])

confusion_entries["dep_var"] = confusion_entries["dep_var"].replace(to_replace = ["fuera_metas", "comorbilidades"], 
                                                                    value = ["Out-goals", "Yes-complications"])

confusion_entries["sample"] = confusion_entries["sample"].replace(to_replace = ["training", "validation"], 
                                                                  value = ["Training", "Validation"])

confusion_entries = confusion_entries.replace(to_replace = ['LogisticRegression', 'AdaBoostClassifier', 'GradientBoostingClassifier', 'LGBMClassifier'], 
                                              value = ["LR", "AdaBoost", "GBC", "LGBoost"])

# Renaming different columns
# NOTE: this reuses the name `rename_dict` that an earlier cell defines for the
# feature labels; the name is kept because the rates cell below reads it.
rename_dict = {"tgt_year" : "Horizon", 
               "base_label" : "Initial condition",
               "dep_var" : "Prediction", 
               "accuracy" : "Accuracy",
               "f1" : "F1-score", 
               "auc" : "AUC", 
               "tp" : "True Pos.", 
               "tn" : "True Neg.", 
               "fp" : "False Pos.", 
               "fn" : "False Neg."}

## Wide table of confusion-matrix counts: one row per model, one column per (count, sample)
count_cols = ["True Pos.", 
              "True Neg.", 
              "False Pos.", 
              "False Neg."]
subsample = pd.pivot(confusion_entries.rename(rename_dict, axis = 1), 
                     index = ['Horizon', 'Initial condition',  'Prediction', "Model"], 
                     columns = "sample", 
                     values = count_cols).reset_index()

# Rows sorting: by horizon, then by predicted outcome
row_order = [("1-year", "Out-goals"), 
             ("1-year", "Yes-complications"), 
             ("2-year", "Out-goals"), 
             ("2-year", "Yes-complications")]
subsample = pd.concat([subsample[(subsample['Prediction'] == prediction) & (subsample['Horizon'] == horizon)] 
                       for horizon, prediction in row_order])

# Final cleaning: drop the numeric prefix of the initial condition.
# Raw string + regex = True: since pandas 2.0 str.replace treats the pattern
# as a literal by default, which would leave the digits in place.
subsample['Initial condition'] = subsample['Initial condition'].str.replace(r'\d+', '', regex = True)
cols_order = [('Horizon',''),
              ('Initial condition',''),
              ('Prediction',''),
              ('Model','')] + [(y, x) for x in ["Training", 
                                                "Validation"] for y in count_cols]
subsample = subsample.loc[:, cols_order].round(4)
# Written under temp_path, like the input file read at the top of this cell
subsample.to_csv(temp_path+"ryr_results\\baseline_replicate\\predicted_probabilities\\confusion_matrices.csv", 
                 sep = ";", index = False)


print("Confusion matrix under optimal cutoff:")
subsample

Confusion matrix under optimal cutoff:


Unnamed: 0_level_0,Horizon,Initial condition,Prediction,Model,True Pos.,True Neg.,False Pos.,False Neg.,True Pos.,True Neg.,False Pos.,False Neg.
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Training,Training,Training,Training,Validation,Validation,Validation,Validation
0,1-year,On-goals,Out-goals,NN,386,2521,162,191,130,680,470,118
2,1-year,Out-goals,Out-goals,LR,1430,298,495,633,429,211,130,455
4,1-year,Not-complications,Out-goals,NN,554,4294,493,552,109,1774,278,365
5,1-year,Yes-complications,Out-goals,LR,2231,796,670,1851,1163,254,375,586
1,1-year,On-goals,Yes-complications,AdaBoost,1941,409,727,183,840,136,351,71
3,1-year,Out-goals,Yes-complications,GBC,1655,445,508,248,760,102,307,56
6,2-year,On-goals,Out-goals,NN,240,1435,223,249,99,492,220,110
8,2-year,Out-goals,Out-goals,LGBoost,849,332,302,405,382,101,171,156
10,2-year,Not-complications,Out-goals,NN,375,1662,90,65,96,438,313,93
11,2-year,Yes-complications,Out-goals,LR,841,412,338,679,596,45,277,55


In [57]:
## Wide table of confusion-matrix rates: one row per model, one column per (rate, sample)
# Requires `confusion_entries` (with the TPR/FPR/TNR/FNR columns) from the
# confusion-matrix counts cell. The column renaming is defined locally so this
# cell does not depend on which `rename_dict` happens to be in scope.
rates_rename = {"tgt_year" : "Horizon", 
                "base_label" : "Initial condition",
                "dep_var" : "Prediction"}
rate_cols = ["TPR", 
             "FPR", 
             "TNR", 
             "FNR"]
subsample = pd.pivot(confusion_entries.rename(rates_rename, axis = 1), 
                     index = ['Horizon', 'Initial condition',  'Prediction', "Model"], 
                     columns = "sample", 
                     values = rate_cols).reset_index()

# Rows sorting: by horizon, then by predicted outcome
row_order = [("1-year", "Out-goals"), 
             ("1-year", "Yes-complications"), 
             ("2-year", "Out-goals"), 
             ("2-year", "Yes-complications")]
subsample = pd.concat([subsample[(subsample['Prediction'] == prediction) & (subsample['Horizon'] == horizon)] 
                       for horizon, prediction in row_order])

# Final cleaning: drop the numeric prefix of the initial condition.
# Raw string + regex = True: since pandas 2.0 str.replace treats the pattern
# as a literal by default, which would leave the digits in place.
subsample['Initial condition'] = subsample['Initial condition'].str.replace(r'\d+', '', regex = True)
cols_order = [('Horizon',''),
              ('Initial condition',''),
              ('Prediction',''),
              ('Model','')] + [(y, x) for x in ["Training", 
                                                "Validation"] for y in rate_cols]
subsample = subsample.loc[:, cols_order].round(4)
# Written under temp_path, like the other outputs of this notebook
subsample.to_csv(temp_path+"ryr_results\\baseline_replicate\\predicted_probabilities\\confusion_matrices_rates.csv", 
                 sep = ";", index = False)


print("Confusion matrix under optimal cutoff:")
subsample

Confusion matrix under optimal cutoff:


Unnamed: 0_level_0,Horizon,Initial condition,Prediction,Model,TPR,FPR,TNR,FNR,TPR,FPR,TNR,FNR
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Training,Training,Training,Training,Validation,Validation,Validation,Validation
0,1-year,On-goals,Out-goals,NN,0.669,0.0604,0.9396,0.331,0.5242,0.4087,0.5913,0.4758
2,1-year,Out-goals,Out-goals,LR,0.6932,0.6242,0.3758,0.3068,0.4853,0.3812,0.6188,0.5147
4,1-year,Not-complications,Out-goals,NN,0.5009,0.103,0.897,0.4991,0.23,0.1355,0.8645,0.77
5,1-year,Yes-complications,Out-goals,LR,0.5465,0.457,0.543,0.4535,0.665,0.5962,0.4038,0.335
1,1-year,On-goals,Yes-complications,AdaBoost,0.9138,0.64,0.36,0.0862,0.9221,0.7207,0.2793,0.0779
3,1-year,Out-goals,Yes-complications,GBC,0.8697,0.5331,0.4669,0.1303,0.9314,0.7506,0.2494,0.0686
6,2-year,On-goals,Out-goals,NN,0.4908,0.1345,0.8655,0.5092,0.4737,0.309,0.691,0.5263
8,2-year,Out-goals,Out-goals,LGBoost,0.677,0.4763,0.5237,0.323,0.71,0.6287,0.3713,0.29
10,2-year,Not-complications,Out-goals,NN,0.8523,0.0514,0.9486,0.1477,0.5079,0.4168,0.5832,0.4921
11,2-year,Yes-complications,Out-goals,LR,0.5533,0.4507,0.5493,0.4467,0.9155,0.8602,0.1398,0.0845


### Barplot of F1-scores 

In [47]:
## Load the long panel of results
# (alternative input, currently unused: temp_path + 'original_submission_res_long.csv')
stats_long = pd.read_csv(temp_path + "kappa_opt_long_res.csv", sep = ';')

# Keep the rows flagged with max_kappa == 1; .copy() gives an independent
# frame so the column edits below are applied to it for certain
stats_long = stats_long[stats_long['max_kappa'] == 1].copy()

# Correct the base labels and the variables names.
# The result is assigned back: a chained `df[col].replace(..., inplace = True)`
# does not update the parent frame when pandas Copy-on-Write is active.
stats_long["base_label"] = stats_long["base_label"].replace(to_replace = ["1 On-target", 
                                                                          "2 Out-target", 
                                                                          "3 Not-complications", 
                                                                          "4 Yes-complications"], 
                                                            value = ["On-target", 
                                                                     "Out-target", 
                                                                     "Not-complications", 
                                                                     "Yes-complications"])

stats_long = stats_long.rename(columns = {"tgt_year" : "Horizon", 
                                          "sample" : "Sample", 
                                          "f1" : "F1 Score"})

# One bar per (initial condition, predicted outcome) pair
stats_long["Init. Cond.-Pred."] = stats_long['base_label']+"-"+stats_long['dep_var']

## Generate the barplot: one facet row per horizon, one facet column per sample
fig = px.bar(stats_long, 
       x = "Init. Cond.-Pred.", y = "F1 Score",
       facet_row = 'Horizon', facet_col = "Sample", color = 'Init. Cond.-Pred.')

fig.update_layout(legend = dict(orientation = 'h', 
                               xanchor = 'left', 
                               x = -0.1))
# The x tick labels are hidden; the colour legend identifies each bar
fig.update_xaxes(visible=False)
pio.write_image(fig,
                temp_path+"ryr_results\\baseline_replicate\\predicted_probabilities\\"+"f1_score_per_model.jpg", 
                format="jpeg", 
                scale=1, width=1000, height=800)