In [1]:
from IPython.display import display, HTML
# Inject a CSS rule that forces the `.container` element to 100% width.
full_width_style = HTML("<style>.container { width:100% !important; }</style>")
display(full_width_style)

# Packages

In [2]:
import pandas as pd
from pathlib import Path
import os
from sklearn.model_selection import train_test_split
from pycaret.classification import *
import pickle

# Pandas rendering limits: no cap on displayed columns, at most 120 displayed rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 120

In [23]:
# Project locations, all derived from the parent of the current working directory.
PATH_REPO = Path.cwd().parent
PATH_DATA = PATH_REPO.joinpath("data")
PATH_DATA_INPUT = PATH_DATA.joinpath("input")

# Column-name constants: GWG is an existing column name, TARGET is the label created later on.
GWG, TARGET = "GWG", "IS_SICK"

PATH_REPO

WindowsPath('D:/git-projects/optiGWG')

# Read Data

In [24]:
# Processed dataset; the first CSV column becomes the index.
process = pd.read_csv(PATH_DATA_INPUT / "01_process.csv", index_col=0)

# Pickled column-name collections, listed in the order they are unpacked below.
# NOTE(review): pickle.load can execute arbitrary code — only load files produced by this project.
pickled_name_files = (
    "data/models/numeric_features_names.pkl",
    "data/models/categoric_features_names.pkl",
    "data/models/targets_names.pkl",
)
loaded_names = []
for relative_path in pickled_name_files:
    with open(PATH_REPO / relative_path, 'rb') as f:
        loaded_names.append(pickle.load(f))
numeric_features_names, categoric_features_names, targets_names = loaded_names

# Summary of how many names fall in each group and their total.
n_numeric = len(numeric_features_names)
n_categoric = len(categoric_features_names)
n_targets = len(targets_names)
print("Total columns names: Numerics ({}) + Categoric ({}) + Targets ({}) = {}".format(
    n_numeric, n_categoric, n_targets, n_numeric + n_categoric + n_targets))
process

Total columns names: Numerics (12) + Categoric (89) + Targets (4) = 105


Unnamed: 0,LGA,SGA,hipertension,diabetes,edad_a,paf_no_a,paf_diab_a,paf_tbcp_a,paf_hipe_a,paf_aler_a,...,grpsanr_a_1,grpsanr_a_2,grpsanr_a_5,grpsanr_a_infrequent_sklearn,previsio_a_1,previsio_a_2,previsio_a_3,previsio_a_4,previsio_a_5,previsio_a_6
0,0,0,0,0,39.000000,0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
1,0,0,0,0,24.000000,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,0,0,0,0,28.000000,0,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,0,0,0,0,24.000000,0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,0
4,1,0,0,0,25.000000,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93942,0,0,0,0,27.223962,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
93943,1,0,0,0,27.223962,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
93944,1,0,0,0,27.223962,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
93945,1,0,0,0,27.223962,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0


# Create new target

In [25]:
# Build the modelling frame: every column of `process` except the original
# targets, plus a single combined label stored under TARGET.
# `drop` returns a new frame, so `process` itself is left untouched and this
# cell can be re-run safely.
data_target = process.drop(columns=targets_names)

# The combined label is the row-wise maximum over the original target columns.
# Vectorised `.max(axis=1)` replaces a per-row Python `max` call.
# NOTE(review): presumably the targets are 0/1 indicators, so the maximum acts as a
# logical OR — confirm. Unlike the builtin `max`, `.max` skips NaN values.
data_target[TARGET] = process[targets_names].max(axis=1)

# Mean of the combined label.
data_target[TARGET].mean()

0.4199814789189649

# Compare Models

In [41]:
# Estimators left out of the comparison.
exclude = ['knn']

# Fixed seed passed to PyCaret so the train/test split, CV folds and model fits
# are repeatable across kernel restarts.
SESSION_ID = 42

# Configure the PyCaret experiment: 70% of the rows go to training and the
# feature-selection / multicollinearity / low-variance filters are enabled.
# Every feature column (numeric and categoric name lists alike) is declared numeric.
# NOTE(review): presumably because the categoric columns are already encoded as
# indicator columns — confirm.
clf = setup(data=data_target, target=TARGET, train_size=0.7, preprocess=True,
            verbose=True, silent=True, session_id=SESSION_ID,
            numeric_features=numeric_features_names + categoric_features_names,
            feature_selection=True, remove_multicollinearity=True,
            feature_selection_threshold=.3, ignore_low_variance=True)

# Train the candidate models and keep the one ranked first by AUC.
best = compare_models(sort='AUC', exclude=exclude)

# Capture the comparison grid immediately: pull() returns the most recently
# displayed score grid.
best_models_dict = {"best": best, "setup": clf, "scoring_grid": pull()}


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.6212,0.629,0.336,0.5842,0.4266,0.1734,0.1883,9.751
lightgbm,Light Gradient Boosting Machine,0.6206,0.6268,0.2986,0.5952,0.3976,0.1631,0.1838,0.347
gbc,Gradient Boosting Classifier,0.6176,0.6199,0.2723,0.5966,0.3739,0.1506,0.1747,3.066
xgboost,Extreme Gradient Boosting,0.6151,0.6174,0.3497,0.5666,0.4325,0.1652,0.1763,3.183
ada,Ada Boost Classifier,0.6034,0.6036,0.2733,0.5552,0.3663,0.1239,0.1404,0.859
rf,Random Forest Classifier,0.6088,0.6025,0.3022,0.5626,0.3932,0.1415,0.1565,2.515
qda,Quadratic Discriminant Analysis,0.5841,0.5741,0.3656,0.5092,0.4209,0.1118,0.117,0.101
lr,Logistic Regression,0.5994,0.5725,0.1434,0.593,0.2308,0.0806,0.118,1.581
lda,Linear Discriminant Analysis,0.5997,0.5724,0.1432,0.595,0.2308,0.0812,0.1191,0.171
et,Extra Trees Classifier,0.5819,0.57,0.3149,0.5025,0.3872,0.0947,0.1006,3.168


In [42]:
# Diagnostic plots to export for the selected model.
plots_names = ['confusion_matrix', 'feature_all', 'auc']
for pn in plots_names:
    # Each plot type gets its own directory under data/plots/<TARGET>/.
    path = PATH_DATA / "plots/{}/{}".format(TARGET, pn)
    # Create the directory first so saving does not fail on a fresh checkout.
    path.mkdir(parents=True, exist_ok=True)
    plot_model(best, plot = pn, save=path)
    print("file saved in", path)

# Feature names exposed by the selected model.
selected_features = list(best.feature_names_)

# Categoric names contained in any model feature name, in model-feature order.
# NOTE(review): this is a substring test (`in` on strings), not equality, so one
# categoric name can match several model features — confirm that is intended.
new_categoric_features_names = [
    categoric_name
    for model_feature in selected_features
    for categoric_name in categoric_features_names
    if categoric_name in model_feature
]

# Model features that are exact members of the numeric name list.
new_numeric_features_names = [
    model_feature
    for model_feature in selected_features
    if model_feature in numeric_features_names
]
        


# Persist the selected model together with the feature-name lists retained for it.
artifacts = {
    "best_model": best,
    "categoric_features_names": new_categoric_features_names,
    "numeric_features_names": new_numeric_features_names,
}
for file_name, elem in artifacts.items():
    path = PATH_DATA / "models/{}/{}.pkl".format(TARGET, file_name)
    # Create data/models/<TARGET>/ first so open() does not fail on a fresh checkout.
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "wb") as of:
        pickle.dump(elem, of)
    # Report only after the file handle has been closed.
    print("file saved in", path)
    
# Use the comparison grid captured right after compare_models() rather than
# calling pull() again: pull() returns the most recently displayed grid, which
# depends on what ran in between.
scoring_grid = best_models_dict["scoring_grid"]

path = PATH_DATA / "output/{}/scoring_grid.xlsx".format(TARGET)
# Create data/output/<TARGET>/ first so the export does not fail on a fresh checkout.
path.parent.mkdir(parents=True, exist_ok=True)
scoring_grid.to_excel(path)
print("file saved in", path)

scoring_grid

file saved in D:\git-projects\optiGWG\data\plots\IS_SICK\auc
file saved in D:\git-projects\optiGWG\data\models\IS_SICK\best_model.pkl
file saved in D:\git-projects\optiGWG\data\models\IS_SICK\categoric_features_names.pkl
file saved in D:\git-projects\optiGWG\data\models\IS_SICK\numeric_features_names.pkl
file saved in D:\git-projects\optiGWG\data\output\IS_SICK\scoring_grid.xlsx


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.6212,0.629,0.336,0.5842,0.4266,0.1734,0.1883,9.751
lightgbm,Light Gradient Boosting Machine,0.6206,0.6268,0.2986,0.5952,0.3976,0.1631,0.1838,0.347
gbc,Gradient Boosting Classifier,0.6176,0.6199,0.2723,0.5966,0.3739,0.1506,0.1747,3.066
xgboost,Extreme Gradient Boosting,0.6151,0.6174,0.3497,0.5666,0.4325,0.1652,0.1763,3.183
ada,Ada Boost Classifier,0.6034,0.6036,0.2733,0.5552,0.3663,0.1239,0.1404,0.859
rf,Random Forest Classifier,0.6088,0.6025,0.3022,0.5626,0.3932,0.1415,0.1565,2.515
qda,Quadratic Discriminant Analysis,0.5841,0.5741,0.3656,0.5092,0.4209,0.1118,0.117,0.101
lr,Logistic Regression,0.5994,0.5725,0.1434,0.593,0.2308,0.0806,0.118,1.581
lda,Linear Discriminant Analysis,0.5997,0.5724,0.1432,0.595,0.2308,0.0812,0.1191,0.171
et,Extra Trees Classifier,0.5819,0.57,0.3149,0.5025,0.3872,0.0947,0.1006,3.168


In [43]:
# Inspect the numeric feature names that also appear in the selected model's feature_names_.
new_numeric_features_names

['edad_a', 'nrocontr_a', 'ppr_a', 'GWG', 'ae_a', 'inicontp_a', 'pter_a', 'IMC']

In [44]:
# Inspect the categoric feature names matched (by substring) against the selected model's feature_names_.
new_categoric_features_names

['mecr_des_a',
 'grpsanc_a_1',
 'estudio_a_4',
 'est_civil_a_1',
 'est_civil_a_2',
 'paf_hipe_a',
 'paf_diab_a',
 'previsio_a_4',
 'previsio_a_1',
 'activida_a_6',
 'paf_no_a',
 'activida_a_2',
 'estnutri_a_3',
 'estudio_a_2',
 'estudio_a_3',
 'comuna_a_puente alto',
 'pam_no_a',
 'previsio_a_6',
 'anticonc_a_3',
 'mecr_no_a']

In [37]:
# Inspect the full list of categoric feature names loaded from the pickle file.
# NOTE(review): execution count In [37] is out of order relative to the cells above — re-run top to bottom or remove.
categoric_features_names

['activida_a_1',
 'activida_a_2',
 'activida_a_3',
 'activida_a_4',
 'activida_a_5',
 'activida_a_6',
 'activida_a_infrequent_sklearn',
 'alcohol_a_2',
 'alcohol_a_infrequent_sklearn',
 'anticonc_a_3',
 'anticonc_a_4',
 'anticonc_a_5',
 'anticonc_a_infrequent_sklearn',
 'cigardia_a_2',
 'cigardia_a_3',
 'cigardia_a_infrequent_sklearn',
 'comuna_a_la pintana',
 'comuna_a_pirque',
 'comuna_a_puente alto',
 'comuna_a_san jose maipo',
 'comuna_a_infrequent_sklearn',
 'est_civil_a_1',
 'est_civil_a_2',
 'est_civil_a_3',
 'est_civil_a_4',
 'estnutri_a_1',
 'estnutri_a_2',
 'estnutri_a_3',
 'estnutri_a_4',
 'estudio_a_1',
 'estudio_a_2',
 'estudio_a_3',
 'estudio_a_4',
 'estudio_a_5',
 'grpsanc_a_1',
 'grpsanc_a_2',
 'grpsanc_a_3',
 'grpsanc_a_4',
 'grpsanc_a_5',
 'grpsanr_a_1',
 'grpsanr_a_2',
 'grpsanr_a_5',
 'grpsanr_a_infrequent_sklearn',
 'previsio_a_1',
 'previsio_a_2',
 'previsio_a_3',
 'previsio_a_4',
 'previsio_a_5',
 'previsio_a_6',
 'dro_amfe_a',
 'dro_benz_a',
 'dro_coca_a',
 'dro

In [31]:
# Inspect the full list of categoric feature names loaded from the pickle file.
# NOTE(review): repeats an identical inspection cell and has an out-of-order execution count (In [31]) — candidate for removal.
categoric_features_names

['activida_a_1',
 'activida_a_2',
 'activida_a_3',
 'activida_a_4',
 'activida_a_5',
 'activida_a_6',
 'activida_a_infrequent_sklearn',
 'alcohol_a_2',
 'alcohol_a_infrequent_sklearn',
 'anticonc_a_3',
 'anticonc_a_4',
 'anticonc_a_5',
 'anticonc_a_infrequent_sklearn',
 'cigardia_a_2',
 'cigardia_a_3',
 'cigardia_a_infrequent_sklearn',
 'comuna_a_la pintana',
 'comuna_a_pirque',
 'comuna_a_puente alto',
 'comuna_a_san jose maipo',
 'comuna_a_infrequent_sklearn',
 'est_civil_a_1',
 'est_civil_a_2',
 'est_civil_a_3',
 'est_civil_a_4',
 'estnutri_a_1',
 'estnutri_a_2',
 'estnutri_a_3',
 'estnutri_a_4',
 'estudio_a_1',
 'estudio_a_2',
 'estudio_a_3',
 'estudio_a_4',
 'estudio_a_5',
 'grpsanc_a_1',
 'grpsanc_a_2',
 'grpsanc_a_3',
 'grpsanc_a_4',
 'grpsanc_a_5',
 'grpsanr_a_1',
 'grpsanr_a_2',
 'grpsanr_a_5',
 'grpsanr_a_infrequent_sklearn',
 'previsio_a_1',
 'previsio_a_2',
 'previsio_a_3',
 'previsio_a_4',
 'previsio_a_5',
 'previsio_a_6',
 'dro_amfe_a',
 'dro_benz_a',
 'dro_coca_a',
 'dro