In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score, roc_auc_score, mean_squared_error, accuracy_score, mean_absolute_error, precision_score, recall_score, f1_score
from pickle import dump
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import GridSearchCV

pd.set_option('display.max_columns', 200)

In [12]:
# Names of the three hospitals, bound in one statement.
hospital1, hospital2, hospital3 = (
    'Anam_hospital',
    'Guro_hospital',
    'Ansan_hospital',
)

# Model training

In [14]:
def nan_del(X, y):
    """Drop the samples whose label is NaN.

    Parameters
    ----------
    X : array-like
        Feature rows; entries along axis 0 are removed at the positions
        where `y` is NaN.
    y : array-like
        Labels, tested with ``np.isnan``.

    Returns
    -------
    tuple
        ``(X_kept, y_kept)``, both new arrays produced by ``np.delete``.
    """
    missing_rows = np.nonzero(np.isnan(y))[0]
    return (
        np.delete(X, missing_rows, axis=0),
        np.delete(y, missing_rows),
    )

In [15]:
def performacne_model(Real_Test_y, y_pred, y_pred_prob):
    """Compute classification metrics for one set of predictions.

    Parameters
    ----------
    Real_Test_y : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels; used for accuracy, precision, recall, F1 and the
        confusion matrix.
    y_pred_prob : array-like
        Scores handed to ``roc_auc_score`` and ``average_precision_score``.

    Returns
    -------
    list
        ``[accuracy, precision, recall, f1, auc, prc, confusion]`` in exactly
        that order.
    """
    # `confusion_matrix` is not among the names imported from sklearn.metrics
    # at the top of the notebook; import it here so the call below does not
    # raise NameError.
    from sklearn.metrics import confusion_matrix

    accuracy = accuracy_score(Real_Test_y, y_pred)
    precision = precision_score(Real_Test_y, y_pred)
    recall = recall_score(Real_Test_y, y_pred)
    f1 = f1_score(Real_Test_y, y_pred)
    auc = roc_auc_score(Real_Test_y, y_pred_prob)
    prc = average_precision_score(Real_Test_y, y_pred_prob)
    confusion = confusion_matrix(Real_Test_y, y_pred)

    return [accuracy, precision, recall, f1, auc, prc, confusion]

In [16]:
def LGBM_tuning_predict(Train_X_sc_MI, Train_y, Real_Test_X_sc, Real_Test_y, External_Test_X1, External_Test_y1):
    """Grid-search a LightGBM classifier and score it on both test sets.

    Parameters
    ----------
    Train_X_sc_MI, Train_y
        Training features and labels given to ``GridSearchCV.fit``.
    Real_Test_X_sc, Real_Test_y
        Internal test features and labels.
    External_Test_X1, External_Test_y1
        External test features and labels.

    Returns
    -------
    tuple
        ``(performance, external_performance1, gscv)``: the metric lists
        returned by ``performacne_model`` for the internal and external test
        sets, followed by the fitted ``GridSearchCV`` object.
    """
    pram = {"max_depth": [15, 20, 25, 30],
            "n_estimators": [100, 200, 300]}

    LGBM_model = LGBMClassifier(force_col_wise=True, verbose=-1)

    gscv = GridSearchCV(estimator=LGBM_model, param_grid=pram, scoring='roc_auc', cv=5, verbose=2)
    gscv.fit(Train_X_sc_MI, Train_y)

    # Internal test set. This evaluation was missing, which left the returned
    # `performance` undefined.
    y_pred = gscv.predict(Real_Test_X_sc)
    y_pred_prob = gscv.predict_proba(Real_Test_X_sc)[:, 1]
    performance = performacne_model(Real_Test_y, y_pred, y_pred_prob)

    # External test set. Column 1 of predict_proba is used as the score.
    external_y_pred1 = gscv.predict(External_Test_X1)
    external_y_pred_prob1 = gscv.predict_proba(External_Test_X1)[:, 1]
    external_performance1 = performacne_model(External_Test_y1, external_y_pred1, external_y_pred_prob1)

    return performance, external_performance1, gscv

In [17]:
def Cat_tuning_predict(Train_X_sc_MI, Train_y, Real_Test_X_sc, Real_Test_y, External_Test_X1, External_Test_y1):
    """Grid-search a CatBoost classifier and score it on both test sets.

    Parameters
    ----------
    Train_X_sc_MI, Train_y
        Training features and labels given to ``GridSearchCV.fit``.
    Real_Test_X_sc, Real_Test_y
        Internal test features and labels.
    External_Test_X1, External_Test_y1
        External test features and labels.

    Returns
    -------
    tuple
        ``(performance, external_performance1, gscv)``: the metric lists
        returned by ``performacne_model`` for the internal and external test
        sets, followed by the fitted ``GridSearchCV`` object.
    """
    pram = {"max_depth": [6, 8, 10, 12],
            "n_estimators": [100, 200, 300]}

    Cat_model = CatBoostClassifier(verbose=False)

    gscv = GridSearchCV(estimator=Cat_model, param_grid=pram, scoring='roc_auc', cv=5, verbose=2)
    gscv.fit(Train_X_sc_MI, Train_y)

    # Internal test set. This evaluation was missing, which left the returned
    # `performance` undefined.
    y_pred = gscv.predict(Real_Test_X_sc)
    y_pred_prob = gscv.predict_proba(Real_Test_X_sc)[:, 1]
    performance = performacne_model(Real_Test_y, y_pred, y_pred_prob)

    # External test set. Column 1 of predict_proba is used as the score.
    external_y_pred1 = gscv.predict(External_Test_X1)
    external_y_pred_prob1 = gscv.predict_proba(External_Test_X1)[:, 1]
    external_performance1 = performacne_model(External_Test_y1, external_y_pred1, external_y_pred_prob1)

    return performance, external_performance1, gscv

In [18]:
def XGB_tuning_predict(Train_X_sc_MI, Train_y, Real_Test_X_sc, Real_Test_y, External_Test_X1, External_Test_y1):
    """Grid-search an XGBoost classifier and score it on both test sets.

    Parameters
    ----------
    Train_X_sc_MI, Train_y
        Training features and labels given to ``GridSearchCV.fit``.
    Real_Test_X_sc, Real_Test_y
        Internal test features and labels.
    External_Test_X1, External_Test_y1
        External test features and labels.

    Returns
    -------
    tuple
        ``(performance, external_performance1, gscv)``: the metric lists
        returned by ``performacne_model`` for the internal and external test
        sets, followed by the fitted ``GridSearchCV`` object.
    """
    pram = {"max_depth": [15, 20, 25, 30],
            "n_estimators": [100, 200, 300]}

    XGB = XGBClassifier()

    gscv = GridSearchCV(estimator=XGB, param_grid=pram, scoring='roc_auc', cv=5, verbose=2)
    gscv.fit(Train_X_sc_MI, Train_y)

    # Internal test set. Column 1 of predict_proba is used as the score.
    y_pred = gscv.predict(Real_Test_X_sc)
    y_pred_prob = gscv.predict_proba(Real_Test_X_sc)[:, 1]
    performance = performacne_model(Real_Test_y, y_pred, y_pred_prob)

    # External test set. The result is bound to the same name that is
    # returned; the previous mismatch raised NameError.
    external_y_pred1 = gscv.predict(External_Test_X1)
    external_y_pred_prob1 = gscv.predict_proba(External_Test_X1)[:, 1]
    external_performance1 = performacne_model(External_Test_y1, external_y_pred1, external_y_pred_prob1)

    return performance, external_performance1, gscv

In [19]:
def LR_tuning_predict(Train_X_sc_MI, Train_y, Real_Test_X_sc, Real_Test_y, External_Test_X1, External_Test_y1):
    """Grid-search a logistic regression and score it on both test sets.

    Parameters
    ----------
    Train_X_sc_MI, Train_y
        Training features and labels given to ``GridSearchCV.fit``.
    Real_Test_X_sc, Real_Test_y
        Internal test features and labels.
    External_Test_X1, External_Test_y1
        External test features and labels.

    Returns
    -------
    tuple
        ``(performance, external_performance1, gscv)``: the metric lists
        returned by ``performacne_model`` for the internal and external test
        sets, followed by the fitted ``GridSearchCV`` object.
    """
    pram = {"C": [0.5, 1.0, 2, 3],
            "max_iter": [50, 100, 200, 300]}

    LR = LogisticRegression()

    gscv = GridSearchCV(estimator=LR, param_grid=pram, scoring='roc_auc', cv=5, verbose=2)
    gscv.fit(Train_X_sc_MI, Train_y)

    # Internal test set. Column 1 of predict_proba is used as the score.
    y_pred = gscv.predict(Real_Test_X_sc)
    y_pred_prob = gscv.predict_proba(Real_Test_X_sc)[:, 1]
    performance = performacne_model(Real_Test_y, y_pred, y_pred_prob)

    # External test set. The result is bound to the same name that is
    # returned; the previous mismatch raised NameError.
    external_y_pred1 = gscv.predict(External_Test_X1)
    external_y_pred_prob1 = gscv.predict_proba(External_Test_X1)[:, 1]
    external_performance1 = performacne_model(External_Test_y1, external_y_pred1, external_y_pred_prob1)

    return performance, external_performance1, gscv

In [20]:
# Repeated tuning/evaluation experiment.
# For each of 30 seeds and each target column, four models (LightGBM,
# CatBoost, XGBoost, logistic regression) are grid-searched and scored on the
# internal and external test sets. Every seed writes four consecutive rows
# (one per model, in that order) into each result table.
count = 0

# NOTE(review): the last entry is the Hangul jamo 'ㅎ', while the matching
# label column is index 8. It looks like 'g' typed with a Korean IME, but it
# is kept byte-for-byte because it defines the output column names - confirm
# before renaming.
target_name = ['tgc','m','fgc','f','cp','c','a','ag','ㅎ']
perform = ['accuracy','precision','recall','f1','auc','prc']
target_list = ['train_length','valid_length','test_length']
target_list.extend(f'{t}_{p}' for t in target_name for p in perform)
performance_result_df = pd.DataFrame(columns=target_list)
performance_external_result_df1 = pd.DataFrame(columns=target_list)

# The same files are read for every seed and the arrays are never modified in
# place below, so load them once instead of once per iteration.
Train_X_MI = np.load('Preprocessed training data X in .npy format')
Train_y = np.load('Preprocessed training data y in .npy format')
Test_X_MI = np.load('Preprocessed internal test data X in .npy format')
Test_y = np.load('Preprocessed internal test data y in .npy format')
external_test_X1 = np.load('Preprocessed external test data X in .npy format')
external_test_y1 = np.load('Preprocessed external test data y in .npy format')

# Position in this list is the row offset of the model inside a seed's group.
tuning_functions = [LGBM_tuning_predict, Cat_tuning_predict, XGB_tuning_predict, LR_tuning_predict]

for seed in range(30):
    Train_index, Val_index = train_test_split(range(len(Train_X_MI)), test_size=0.3, random_state=seed)
    X_train = Train_X_MI[Train_index]
    X_val = Train_X_MI[Val_index]
    X_test = Test_X_MI

    # Split sizes are recorded on the first row of the seed's group only.
    performance_result_df.loc[count,'train_length'] = len(X_train)
    performance_result_df.loc[count,'valid_length'] = len(X_val)
    performance_result_df.loc[count,'test_length'] = len(X_test)

    for i in range(len(target_name)):
        # Column i of each label array belongs to target_name[i]; samples with
        # a NaN label for this target are dropped separately in every split.
        train_X_i, train_y_i = nan_del(X_train, Train_y[:, i][Train_index])
        val_X_i, val_y_i = nan_del(X_val, Train_y[:, i][Val_index])
        test_X_i, test_y_i = nan_del(X_test, Test_y[:, i])
        external_X_i, external_y_i = nan_del(external_test_X1, external_test_y1[:, i])

        # The train and validation parts are merged again before tuning; the
        # tuning functions run their own cross-validation on this merged set.
        tmp_val_train_X = np.concatenate([train_X_i, val_X_i], axis=0)
        tmp_val_train_y = np.concatenate([train_y_i, val_y_i])

        for offset, tuning_function in enumerate(tuning_functions):
            row = count + offset
            internal_performance, external_performance1, model_info = tuning_function(
                tmp_val_train_X, tmp_val_train_y, test_X_i, test_y_i, external_X_i, external_y_i)

            # NOTE(review): the same path is used for every model, target and
            # seed, so each dump overwrites the previous one - confirm whether
            # a distinct path per run is intended.
            filename = 'Model save path in .pkl format'
            with open(filename, 'wb') as model_file:
                dump(model_info, model_file)

            # Only the first len(perform) entries of each metric list are
            # stored; the trailing confusion matrix is not written.
            for j, performance_name in enumerate(perform):
                target_feature = target_name[i]+'_'+performance_name
                performance_result_df.loc[row, target_feature] = internal_performance[j]
                # Use the same row as the internal table. Writing every
                # model's external metrics to row `count` made the later
                # models overwrite the earlier ones.
                performance_external_result_df1.loc[row, target_feature] = external_performance1[j]
    count += len(tuning_functions)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END .....................max_depth=15, n_estimators=100; total time=   0.5s
[CV] END .....................max_depth=15, n_estimators=100; total time=   0.3s
[CV] END .....................max_depth=15, n_estimators=100; total time=   0.3s
[CV] END .....................max_depth=15, n_estimators=100; total time=   0.3s
[CV] END .....................max_depth=15, n_estimators=100; total time=   0.3s
[CV] END .....................max_depth=15, n_estimators=200; total time=   0.6s
[CV] END .....................max_depth=15, n_estimators=200; total time=   0.5s
[CV] END .....................max_depth=15, n_estimators=200; total time=   0.5s
[CV] END .....................max_depth=15, n_estimators=200; total time=   0.5s
[CV] END .....................max_depth=15, n_estimators=200; total time=   0.5s
[CV] END .....................max_depth=15, n_estimators=300; total time=   0.7s
[CV] END .....................max_depth=15, n_es


KeyboardInterrupt



In [None]:
# Write the internal and external result tables to CSV.
for results_table, csv_path in (
    (performance_result_df, 'model internal test performance file save path'),
    (performance_external_result_df1, 'model external test performance file save path'),
):
    results_table.to_csv(csv_path)