In [1]:
import pandas as pd
import numpy as np

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
%matplotlib inline

matplotlib.rcParams['figure.figsize'] = (12, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']


import os, datetime
import tempfile

import seaborn as sns

os.environ["CUDA_VISIBLE_DEVICES"]="4"
import tensorflow as tf
from tensorflow import keras



from sklearn.mixture import GaussianMixture

from sklearn.metrics import roc_auc_score

#path = '/data/datasets/topcat/data/csv_data/'
path = '/data/datasets/topcat/data/sas_data/'
pathOutcomes = '/data/datasets/topcat/data/Outcomes/'

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

skf = StratifiedKFold(n_splits=5, shuffle=False)

from collections import defaultdict

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
outcome_cols = [
    'death', 'cvd_death', 'time_death', 'anyhosp', 'time_anyhosp',
    'hfhosp', 'time_hfhosp', 'abortedca', 'time_abortedca', 'mi',
    'time_mi', 'stroke', 'time_stroke', 'primary_ep', 'time_primary_ep'
]

In [3]:
con_cat_cols = [
    'GLUCOSE_FAST', 'GLUCOSE_RAND', 'CO2_mmolL', 'GLUCOSE_mgdL','WBC_kuL',
    'HCT_p', 'HB_gdL', 'PLT_kuL', 'ALP_UL', 'TBILI_mgdL', 'ALB_gdL'
]

In [4]:
contin_cols = [
    'BNP_VAL', 'age_entry', 'EF', 'visit_dt1_hf', 'chfdc_dt3', 'mi_dt3',
    'stroke_dt3', 'cabg_dt3', 'pci_dt3', 'DM_AGE_YR', 'DM_DUR_YR', 'cigs',
    'SMOKE_YRS', 'QUIT_YRS', 'HEAVY_MIN', 'HEAVY_WK', 'MED_WK', 'MED_MIN',
    'LIGHT_WK', 'LIGHT_MIN', 'metsperweek', 'cooking_salt_score', 'height',
    'weight', 'waistc', 'HR', 'SBP', 'DBP', 'CR_mgdl', 'gfr', 'labs_dt1',
    'NA_mmolL', 'K_mmolL', 'CL_mmolL', 'BUN_mgdL', 'ALT_UL', 'AST_UL',
    'urine_val_mgg', 'QRS_DUR', 'CR_mgdL', 'BMI'
]

In [5]:
priep = pd.read_csv('/data/datasets/topcat/nch/nn_baseline/primary_ep_set.csv', index_col=0)
death = pd.read_csv('/data/datasets/topcat/nch/nn_baseline/death_set.csv'     , index_col=0)
hfhos = pd.read_csv('/data/datasets/topcat/nch/nn_baseline/hfhosp_set.csv'    , index_col=0)

In [6]:
skf = StratifiedKFold(n_splits=5, shuffle=False)

In [7]:
mode = 2

In [8]:
if mode == 1:
    outcome = 'primary_ep'
    outcome_time = 'time_primary_ep'
    df = priep.copy()
elif mode == 2:
    outcome = 'death'
    outcome_time = 'time_death'
    df = death.copy()
elif mode == 3:
    outcome = 'hfhosp'
    outcome_time = 'time_hfhosp'
    df = hfhos.copy()

labels = df[outcome].copy()
complete_labels = labels.copy()

In [9]:
METRICS = [
  keras.metrics.TruePositives(name='tp'),
  keras.metrics.FalsePositives(name='fp'),
  keras.metrics.TrueNegatives(name='tn'),
  keras.metrics.FalseNegatives(name='fn'), 
  keras.metrics.BinaryAccuracy(name='accuracy'),
  keras.metrics.Precision(name='precision'),
  keras.metrics.Recall(name='recall'),
  keras.metrics.AUC(name='auc'),
]

In [10]:
def make_model(metrics = METRICS, output_bias=None, hl_count=1, hl_size=16, dropout=0.5):
        if output_bias is not None:
            output_bias = tf.keras.initializers.Constant(output_bias)
        model = keras.Sequential()
        model.add(keras.layers.Dense(
                hl_size, activation='relu',
                input_shape=(train_data.shape[-1],)))
        model.add(keras.layers.Dropout(dropout))
        
        for i in range(hl_count-1):
            model.add(
                keras.layers.Dense(hl_size, activation='relu'))
            model.add(keras.layers.Dropout(dropout))
        
        model.add(keras.layers.Dense(1, activation='sigmoid',
                               bias_initializer=output_bias)
        )
        model.compile(
            optimizer=keras.optimizers.Adam(lr=1e-3),
            loss=keras.losses.BinaryCrossentropy(),
            metrics=metrics)
        return model

In [11]:
EPOCHS = 2000
BATCH_SIZE = 64

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc', 
    verbose=0,
    patience=200,
    mode='max',
    restore_best_weights=True)


## Base Test

In [12]:
auc_df = pd.DataFrame(index=range(5))

for i, (train, test) in enumerate(skf.split(df, labels)):
    print(f'Fold {i}: {datetime.datetime.now().strftime("%I:%M:%S %p")}  ', end='')
    train_data = df.iloc[train].copy()
    test_data = df.iloc[test].copy()
    
    train_labels=labels.iloc[train].copy()
    test_labels=labels.iloc[test].copy()
    
    weights = len(train_labels)/test_labels.sum()
    glm_weights = pd.Series(data=1, index=train_labels.index)
    glm_weights.loc[train_labels==1] = weights
    
    #remove outcomes
    train_id = train_data['ID'].copy()
    test_id = test_data['ID'].copy()
    
    train_data.drop(columns=outcome_cols+['ID'], inplace=True)
    test_data.drop(columns= outcome_cols+['ID'], inplace=True)
    
    #print(f'Fold {i} Imputation')
    #imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    #train_data.values = imp.fit_transform(train_data)
    #test_data.values  = imp.transform(test_data)
    test_data = test_data.fillna(train_data.mean())
    train_data = train_data.fillna(train_data.mean())
    
    sd_0_cols = train_data.columns[(train_data.std() == 0)]
    train_data.drop(columns=sd_0_cols, inplace=True)
    test_data.drop(columns=sd_0_cols, inplace=True)
    
    cols_to_scale = [foo for foo in con_cat_cols + contin_cols if foo in train_data.columns]
    scaler = StandardScaler()
    train_data.loc[:,cols_to_scale] = scaler.fit_transform(train_data.loc[:,cols_to_scale])
    test_data.loc[:,cols_to_scale]  = scaler.transform(test_data.loc[:,cols_to_scale])
    
    xgb = XGBClassifier()
    xgb.fit(train_data, train_labels)
    auc = roc_auc_score(test_labels, xgb.predict_proba(test_data)[:,1])
    print(f'XGB AUC={auc:.3f}\tNNs:')
    if i == 0:
        auc_df['xgb'] = np.nan
    auc_df['xgb'].iloc[i] = auc
    
    rf = RandomForestClassifier()
    rf.fit(train_data, train_labels)
    auc = roc_auc_score(test_labels, rf.predict_proba(test_data)[:,1])
    print(f'RF AUC={auc:.3f}\nNNs:')
    if i == 0:
        auc_df['rf'] = np.nan
    auc_df['rf'].iloc[i] = auc
    
    for hl_count in [1,2,3,4]:
        print(f'\t{hl_count} HL(s): ', end='')
        for hl_size in np.linspace(6,60,dtype=int, num=6):
            model = make_model(hl_count=hl_count, hl_size=hl_size)
            name = f'{hl_count}_{hl_size}'
            if name not in auc_df.columns:
                auc_df[name] = np.nan
            
            baseline_history = model.fit(
                train_data.values,
                train_labels.values,
                batch_size=BATCH_SIZE,
                epochs=EPOCHS,
                callbacks = [early_stopping],
                validation_data=(test_data.values, test_labels.values),
                verbose=0
            )
            auc = np.mean(baseline_history.history['val_auc'][-100:])
            run_length = len(baseline_history.history['val_auc'])
            auc_df[name].iloc[i] = auc
            print(f'size{hl_size}=> AUC={auc:.3f} ({str(run_length).rjust(4)} epochs)   ', end='')
        print()
            
        
    #pos = train_labels.sum()
    #total = len(train_labels)
    #neg = total - pos

    

Fold 0: 05:29:27 PM  



XGB AUC=0.723	NNs:
RF AUC=0.745
NNs:
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
size6=> AUC=0.705 ( 816 epochs)   size16=> AUC=0.666 ( 330 epochs)   size27=> AUC=0.660 ( 341 epochs)   size38=> AUC=0.635 ( 342 epochs)   size49=> AUC=0.624 ( 292 epochs)   size60=> AUC=0.656 ( 264 epochs)   
	2 HL(s): size6=> AUC=0.676 ( 651 epochs)   size16=> AUC=0.659 ( 546 epochs)   size27=> AUC=0.644 ( 429 epochs)   size38=> AUC=0.662 ( 407 epochs)   size49=> AUC=0.621 ( 308 epochs)   size60=> AUC=0.608 ( 297 epochs)   
	3 HL(s): size6=> AUC=0.519 ( 228 epochs)   size16=> AUC=0.577 ( 374 epochs)   size27=> AUC=0.604 ( 319 epochs)   size38=> AUC=0.579 ( 330 epochs)   size49=> AUC=0.637 ( 330 epochs)   size60=> AUC=0.618 ( 370 epochs)   
	4 HL(s): size6=> AUC=0.513 ( 216 epochs)   size16=> AUC=0.666 ( 660 epochs)   size27=> AUC=0.619 ( 3



XGB AUC=0.723	NNs:
RF AUC=0.763
NNs:
	1 HL(s): size6=> AUC=0.671 ( 324 epochs)   size16=> AUC=0.698 ( 383 epochs)   size27=> AUC=0.709 ( 338 epochs)   size38=> AUC=0.705 ( 319 epochs)   size49=> AUC=0.723 ( 302 epochs)   size60=> AUC=0.699 ( 316 epochs)   
	2 HL(s): size6=> AUC=0.513 ( 201 epochs)   size16=> AUC=0.674 ( 371 epochs)   size27=> AUC=0.691 ( 333 epochs)   size38=> AUC=0.653 ( 299 epochs)   size49=> AUC=0.675 ( 277 epochs)   size60=> AUC=0.684 ( 283 epochs)   
	3 HL(s): size6=> AUC=0.498 ( 247 epochs)   size16=> AUC=0.644 ( 521 epochs)   size27=> AUC=0.629 ( 384 epochs)   size38=> AUC=0.628 ( 352 epochs)   size49=> AUC=0.643 ( 370 epochs)   size60=> AUC=0.671 ( 311 epochs)   
	4 HL(s): size6=> AUC=0.525 ( 211 epochs)   size16=> AUC=0.661 ( 654 epochs)   size27=> AUC=0.525 ( 205 epochs)   size38=> AUC=0.625 ( 339 epochs)   size49=> AUC=0.613 ( 354 epochs)   size60=> AUC=0.619 ( 366 epochs)   
Fold 2: 06:01:41 PM  



XGB AUC=0.706	NNs:
RF AUC=0.759
NNs:
	1 HL(s): size6=> AUC=0.697 ( 345 epochs)   size16=> AUC=0.702 ( 927 epochs)   size27=> AUC=0.708 ( 346 epochs)   size38=> AUC=0.671 ( 499 epochs)   size49=> AUC=0.694 ( 302 epochs)   size60=> AUC=0.696 ( 298 epochs)   
	2 HL(s): size6=> AUC=0.653 ( 742 epochs)   size16=> AUC=0.652 ( 535 epochs)   size27=> AUC=0.683 ( 402 epochs)   size38=> AUC=0.682 ( 309 epochs)   size49=> AUC=0.682 ( 294 epochs)   size60=> AUC=0.626 ( 325 epochs)   
	3 HL(s): size6=> AUC=0.680 ( 488 epochs)   size16=> AUC=0.627 ( 503 epochs)   size27=> AUC=0.637 ( 377 epochs)   size38=> AUC=0.597 ( 435 epochs)   size49=> AUC=0.632 ( 340 epochs)   size60=> AUC=0.650 ( 362 epochs)   
	4 HL(s): size6=> AUC=0.503 ( 208 epochs)   size16=> AUC=0.607 ( 509 epochs)   size27=> AUC=0.607 ( 418 epochs)   size38=> AUC=0.624 ( 373 epochs)   size49=> AUC=0.642 ( 359 epochs)   size60=> AUC=0.647 ( 431 epochs)   
Fold 3: 06:34:35 PM  



XGB AUC=0.716	NNs:
RF AUC=0.686
NNs:
	1 HL(s): size6=> AUC=0.657 ( 294 epochs)   size16=> AUC=0.664 ( 397 epochs)   size27=> AUC=0.678 ( 337 epochs)   size38=> AUC=0.649 ( 306 epochs)   size49=> AUC=0.650 ( 323 epochs)   size60=> AUC=0.639 ( 291 epochs)   
	2 HL(s): size6=> AUC=0.630 ( 535 epochs)   size16=> AUC=0.683 ( 376 epochs)   size27=> AUC=0.624 ( 284 epochs)   size38=> AUC=0.630 ( 297 epochs)   size49=> AUC=0.632 ( 297 epochs)   size60=> AUC=0.616 ( 275 epochs)   
	3 HL(s): size6=> AUC=0.503 ( 212 epochs)   size16=> AUC=0.653 ( 503 epochs)   size27=> AUC=0.643 ( 343 epochs)   size38=> AUC=0.650 ( 425 epochs)   size49=> AUC=0.612 ( 352 epochs)   size60=> AUC=0.594 ( 393 epochs)   
	4 HL(s): size6=> AUC=0.500 ( 213 epochs)   size16=> AUC=0.588 ( 453 epochs)   size27=> AUC=0.624 ( 365 epochs)   size38=> AUC=0.638 ( 348 epochs)   size49=> AUC=0.593 ( 345 epochs)   size60=> AUC=0.603 ( 304 epochs)   
Fold 4: 07:10:20 PM  



XGB AUC=0.673	NNs:
RF AUC=0.685
NNs:
	1 HL(s): size6=> AUC=0.648 ( 358 epochs)   size16=> AUC=0.662 ( 331 epochs)   size27=> AUC=0.656 ( 293 epochs)   size38=> AUC=0.696 ( 283 epochs)   size49=> AUC=0.665 ( 285 epochs)   size60=> AUC=0.674 ( 295 epochs)   
	2 HL(s): size6=> AUC=0.503 ( 247 epochs)   size16=> AUC=0.661 ( 481 epochs)   size27=> AUC=0.661 ( 336 epochs)   size38=> AUC=0.663 ( 320 epochs)   size49=> AUC=0.675 ( 367 epochs)   size60=> AUC=0.672 ( 255 epochs)   
	3 HL(s): size6=> AUC=0.536 ( 206 epochs)   size16=> AUC=0.650 ( 389 epochs)   size27=> AUC=0.678 ( 378 epochs)   size38=> AUC=0.615 ( 320 epochs)   size49=> AUC=0.683 ( 811 epochs)   size60=> AUC=0.669 ( 357 epochs)   
	4 HL(s): size6=> AUC=0.501 ( 219 epochs)   size16=> AUC=0.462 ( 231 epochs)   size27=> AUC=0.681 ( 367 epochs)   size38=> AUC=0.655 ( 325 epochs)   size49=> AUC=0.628 ( 371 epochs)   size60=> AUC=0.636 ( 309 epochs)   


In [13]:
auc_df

Unnamed: 0,xgb,rf,1_6,1_16,1_27,1_38,1_49,1_60,2_6,2_16,...,3_27,3_38,3_49,3_60,4_6,4_16,4_27,4_38,4_49,4_60
0,0.72299,0.745161,0.704831,0.666094,0.659924,0.635322,0.624184,0.655749,0.676154,0.659225,...,0.60448,0.579489,0.636884,0.617966,0.513198,0.666044,0.618758,0.653061,0.65839,0.6182
1,0.7234,0.76339,0.670584,0.697772,0.709099,0.704773,0.722849,0.698878,0.512824,0.673532,...,0.629316,0.628031,0.643108,0.671422,0.525262,0.660613,0.52521,0.625148,0.612534,0.61884
2,0.706369,0.758737,0.696963,0.702016,0.70807,0.671457,0.694328,0.695606,0.653457,0.652058,...,0.636811,0.597184,0.631944,0.64954,0.502518,0.606763,0.607427,0.623734,0.642278,0.647226
3,0.716129,0.685744,0.657045,0.664172,0.677958,0.649218,0.649572,0.639214,0.629906,0.682897,...,0.643099,0.64958,0.61175,0.593688,0.5,0.588022,0.624256,0.638434,0.593176,0.603035
4,0.672945,0.684599,0.647849,0.662073,0.656213,0.695544,0.665302,0.674482,0.503366,0.661257,...,0.677719,0.614941,0.683148,0.668512,0.500581,0.462446,0.680957,0.65507,0.627833,0.635788


In [14]:
auc_scores = auc_df.T
auc_scores['lower_ci'] = auc_df.T.mean(axis=1) - 1.96*auc_df.T.std(axis=1)/auc_df.T.count(axis=1)
auc_scores['mean'] = auc_df.T.mean(axis=1)
auc_scores['upper_ci'] = auc_df.T.mean(axis=1) + 1.96*auc_df.T.std(axis=1)/auc_df.T.count(axis=1)
#auc_scores.sort_values(by='mean', ascending=False)
auc_scores

Unnamed: 0,0,1,2,3,4,lower_ci,mean,upper_ci
xgb,0.72299,0.7234,0.706369,0.716129,0.672945,0.700147,0.708367,0.716586
rf,0.745161,0.76339,0.758737,0.685744,0.684599,0.712143,0.727526,0.742909
1_6,0.704831,0.670584,0.696963,0.657045,0.647849,0.665752,0.675454,0.685156
1_16,0.666094,0.697772,0.702016,0.664172,0.662073,0.6707,0.678425,0.68615
1_27,0.659924,0.709099,0.70807,0.677958,0.656213,0.672293,0.682253,0.692213
1_38,0.635322,0.704773,0.671457,0.649218,0.695544,0.659683,0.671263,0.682842
1_49,0.624184,0.722849,0.694328,0.649572,0.665302,0.656172,0.671247,0.686322
1_60,0.655749,0.698878,0.695606,0.639214,0.674482,0.66275,0.672786,0.682822
2_6,0.676154,0.512824,0.653457,0.629906,0.503366,0.563312,0.595141,0.62697
2_16,0.659225,0.673532,0.652058,0.682897,0.661257,0.660973,0.665794,0.670614


In [13]:
import imblearn
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
oversample = SMOTE()
ran_undersample = RandomUnderSampler()

## SMOTE on Entire Dataset

In [19]:
auc_df = pd.DataFrame(index=range(5))
#---------------------------------------------------------------------------
df = df.fillna(df.mean()) # oversample cant take NaN values
# Oversampling the minority class in the entire dataset
df, labels = oversample.fit_resample(df, labels)
#---------------------------------------------------------------------------
for i, (train, test) in enumerate(skf.split(df, labels)):
    print(f'Fold {i}: {datetime.datetime.now().strftime("%I:%M:%S %p")}  ', end='')
    train_data = df.iloc[train].copy()
    test_data = df.iloc[test].copy()
    
    train_labels=labels.iloc[train].copy()
    test_labels=labels.iloc[test].copy()
    
    weights = len(train_labels)/test_labels.sum()
    glm_weights = pd.Series(data=1, index=train_labels.index)
    glm_weights.loc[train_labels==1] = weights
    
    #remove outcomes
    train_id = train_data['ID'].copy()
    test_id = test_data['ID'].copy()
    
    train_data.drop(columns=outcome_cols+['ID'], inplace=True)
    test_data.drop(columns= outcome_cols+['ID'], inplace=True)
    
    #print(f'Fold {i} Imputation')
    #imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    #train_data.values = imp.fit_transform(train_data)
    #test_data.values  = imp.transform(test_data)
    test_data = test_data.fillna(train_data.mean())
    train_data = train_data.fillna(train_data.mean())
    
    sd_0_cols = train_data.columns[(train_data.std() == 0)]
    train_data.drop(columns=sd_0_cols, inplace=True)
    test_data.drop(columns=sd_0_cols, inplace=True)
    
    cols_to_scale = [foo for foo in con_cat_cols + contin_cols if foo in train_data.columns]
    scaler = StandardScaler()
    train_data.loc[:,cols_to_scale] = scaler.fit_transform(train_data.loc[:,cols_to_scale])
    test_data.loc[:,cols_to_scale]  = scaler.transform(test_data.loc[:,cols_to_scale])
    
    xgb = XGBClassifier()
    xgb.fit(train_data, train_labels)
    auc = roc_auc_score(test_labels, xgb.predict_proba(test_data)[:,1])
    print(f'XGB AUC={auc:.3f}\tNNs:')
    if i == 0:
        auc_df['xgb'] = np.nan
    auc_df['xgb'].iloc[i] = auc
    
    rf = RandomForestClassifier()
    rf.fit(train_data, train_labels)
    auc = roc_auc_score(test_labels, rf.predict_proba(test_data)[:,1])
    print(f'RF AUC={auc:.3f}\nNNs:')
    if i == 0:
        auc_df['rf'] = np.nan
    auc_df['rf'].iloc[i] = auc
    
    for hl_count in [1,2,3,4]:
        print(f'\t{hl_count} HL(s): ', end='')
        for hl_size in np.linspace(6,60,dtype=int, num=6):
            model = make_model(hl_count=hl_count, hl_size=hl_size)
            name = f'{hl_count}_{hl_size}'
            if name not in auc_df.columns:
                auc_df[name] = np.nan
            
            baseline_history = model.fit(
                train_data.values,
                train_labels.values,
                batch_size=BATCH_SIZE,
                epochs=EPOCHS,
                callbacks = [early_stopping],
                validation_data=(test_data.values, test_labels.values),
                verbose=0
            )
            auc = np.mean(baseline_history.history['val_auc'][-100:])
            run_length = len(baseline_history.history['val_auc'])
            auc_df[name].iloc[i] = auc
            print(f'size{hl_size}=> AUC={auc:.3f} ({str(run_length).rjust(4)} epochs)   ', end='')
        print()
            
        
    #pos = train_labels.sum()
    #total = len(train_labels)
    #neg = total - pos

    

Fold 0: 08:25:33 PM  



XGB AUC=0.774	NNs:
RF AUC=0.816
NNs:
	1 HL(s): size6=> AUC=0.760 ( 435 epochs)   size16=> AUC=0.776 ( 407 epochs)   size27=> AUC=0.829 ( 785 epochs)   size38=> AUC=0.824 ( 904 epochs)   size49=> AUC=0.832 ( 848 epochs)   size60=> AUC=0.832 ( 961 epochs)   
	2 HL(s): size6=> AUC=0.763 (1184 epochs)   size16=> AUC=0.776 ( 773 epochs)   size27=> AUC=0.802 ( 855 epochs)   size38=> AUC=0.813 ( 704 epochs)   size49=> AUC=0.811 ( 642 epochs)   size60=> AUC=0.813 ( 779 epochs)   
	3 HL(s): size6=> AUC=0.502 ( 201 epochs)   size16=> AUC=0.784 ( 410 epochs)   size27=> AUC=0.770 ( 659 epochs)   size38=> AUC=0.728 ( 465 epochs)   size49=> AUC=0.764 ( 628 epochs)   size60=> AUC=0.741 ( 776 epochs)   
	4 HL(s): size6=> AUC=0.487 ( 207 epochs)   size16=> AUC=0.710 ( 498 epochs)   size27=> AUC=0.736 ( 378 epochs)   size38=> AUC=0.750 ( 460 epochs)   size49=> AUC=0.742 ( 450 epochs)   size60=> AUC=0.766 ( 699 epochs)   
Fold 1: 09:50:01 PM  



XGB AUC=0.747	NNs:
RF AUC=0.817
NNs:
	1 HL(s): size6=> AUC=0.734 ( 452 epochs)   size16=> AUC=0.806 ( 783 epochs)   size27=> AUC=0.831 ( 974 epochs)   size38=> AUC=0.825 ( 704 epochs)   size49=> AUC=0.844 ( 896 epochs)   size60=> AUC=0.846 (1067 epochs)   
	2 HL(s): size6=> AUC=0.765 (1126 epochs)   size16=> AUC=0.770 ( 647 epochs)   size27=> AUC=0.794 ( 763 epochs)   size38=> AUC=0.800 ( 832 epochs)   size49=> AUC=0.792 ( 809 epochs)   size60=> AUC=0.821 ( 779 epochs)   
	3 HL(s): size6=> AUC=0.763 ( 774 epochs)   size16=> AUC=0.756 ( 626 epochs)   size27=> AUC=0.763 ( 426 epochs)   size38=> AUC=0.803 ( 561 epochs)   size49=> AUC=0.828 ( 554 epochs)   size60=> AUC=0.781 ( 395 epochs)   
	4 HL(s): size6=> AUC=0.684 ( 706 epochs)   size16=> AUC=0.751 ( 411 epochs)   size27=> AUC=0.745 ( 547 epochs)   size38=> AUC=0.766 ( 614 epochs)   size49=> AUC=0.798 ( 434 epochs)   size60=> AUC=0.789 ( 404 epochs)   
Fold 2: 11:29:19 PM  



XGB AUC=0.994	NNs:
RF AUC=0.997
NNs:
	1 HL(s): size6=> AUC=0.792 ( 819 epochs)   size16=> AUC=0.857 ( 761 epochs)   size27=> AUC=0.882 ( 981 epochs)   size38=> AUC=0.883 ( 647 epochs)   size49=> AUC=0.884 ( 506 epochs)   size60=> AUC=0.922 ( 850 epochs)   
	2 HL(s): size6=> AUC=0.767 ( 517 epochs)   size16=> AUC=0.803 ( 926 epochs)   size27=> AUC=0.853 (1308 epochs)   size38=> AUC=0.891 ( 774 epochs)   size49=> AUC=0.877 (1070 epochs)   size60=> AUC=0.877 ( 738 epochs)   
	3 HL(s): size6=> AUC=0.666 ( 382 epochs)   size16=> AUC=0.761 ( 859 epochs)   size27=> AUC=0.833 (1285 epochs)   size38=> AUC=0.899 ( 734 epochs)   size49=> AUC=0.875 ( 870 epochs)   size60=> AUC=0.871 ( 717 epochs)   
	4 HL(s): size6=> AUC=0.596 ( 916 epochs)   size16=> AUC=0.838 (1399 epochs)   size27=> AUC=0.741 ( 443 epochs)   size38=> AUC=0.873 ( 545 epochs)   size49=> AUC=0.867 ( 465 epochs)   size60=> AUC=0.903 ( 965 epochs)   
Fold 3: 01:38:33 AM  



XGB AUC=0.993	NNs:
RF AUC=0.999
NNs:
	1 HL(s): size6=> AUC=0.771 ( 717 epochs)   size16=> AUC=0.899 ( 830 epochs)   size27=> AUC=0.877 ( 478 epochs)   size38=> AUC=0.895 ( 614 epochs)   size49=> AUC=0.942 ( 773 epochs)   size60=> AUC=0.930 (1150 epochs)   
	2 HL(s): size6=> AUC=0.631 ( 632 epochs)   size16=> AUC=0.826 ( 789 epochs)   size27=> AUC=0.902 ( 591 epochs)   size38=> AUC=0.926 ( 493 epochs)   size49=> AUC=0.932 ( 656 epochs)   size60=> AUC=0.918 ( 658 epochs)   
	3 HL(s): size6=> AUC=0.668 ( 386 epochs)   size16=> AUC=0.840 (1191 epochs)   size27=> AUC=0.834 ( 766 epochs)   size38=> AUC=0.839 ( 468 epochs)   size49=> AUC=0.875 ( 838 epochs)   size60=> AUC=0.909 ( 791 epochs)   
	4 HL(s): size6=> AUC=0.742 ( 491 epochs)   size16=> AUC=0.788 ( 732 epochs)   size27=> AUC=0.853 ( 679 epochs)   size38=> AUC=0.866 ( 727 epochs)   size49=> AUC=0.919 ( 695 epochs)   size60=> AUC=0.883 ( 723 epochs)   
Fold 4: 03:41:46 AM  



XGB AUC=0.996	NNs:
RF AUC=0.999
NNs:
	1 HL(s): size6=> AUC=0.829 (1001 epochs)   size16=> AUC=0.838 ( 755 epochs)   size27=> AUC=0.873 (1021 epochs)   size38=> AUC=0.887 ( 617 epochs)   size49=> AUC=0.918 ( 927 epochs)   size60=> AUC=0.931 ( 734 epochs)   
	2 HL(s): size6=> AUC=0.764 ( 748 epochs)   size16=> AUC=0.847 ( 654 epochs)   size27=> AUC=0.875 ( 652 epochs)   size38=> AUC=0.918 ( 933 epochs)   size49=> AUC=0.895 ( 653 epochs)   size60=> AUC=0.923 (1001 epochs)   
	3 HL(s): size6=> AUC=0.784 ( 940 epochs)   size16=> AUC=0.820 (1062 epochs)   size27=> AUC=0.849 (1387 epochs)   size38=> AUC=0.853 ( 664 epochs)   size49=> AUC=0.877 ( 707 epochs)   size60=> AUC=0.908 ( 879 epochs)   
	4 HL(s): size6=> AUC=0.533 ( 211 epochs)   size16=> AUC=0.829 ( 698 epochs)   size27=> AUC=0.874 (1410 epochs)   size38=> AUC=0.883 ( 833 epochs)   size49=> AUC=0.854 (1029 epochs)   size60=> AUC=0.865 ( 430 epochs)   


## SMOTE and Random Undersampling on Entire Dataset

In [20]:
auc_df = pd.DataFrame(index=range(5))
#---------------------------------------------------------------------------
df = df.fillna(df.mean()) # oversample cant take NaN values
# Oversampling the minority class
df, labels = oversample.fit_resample(df, labels)
# Undersampling the majority class 
df, labels = ran_undersample.fit_resample(df, labels)
#---------------------------------------------------------------------------
for i, (train, test) in enumerate(skf.split(df, labels)):
    print(f'Fold {i}: {datetime.datetime.now().strftime("%I:%M:%S %p")}  ', end='')
    train_data = df.iloc[train].copy()
    test_data = df.iloc[test].copy()
    
    train_labels=labels.iloc[train].copy()
    test_labels=labels.iloc[test].copy()
    
    weights = len(train_labels)/test_labels.sum()
    glm_weights = pd.Series(data=1, index=train_labels.index)
    glm_weights.loc[train_labels==1] = weights
    
    #remove outcomes
    train_id = train_data['ID'].copy()
    test_id = test_data['ID'].copy()
    
    train_data.drop(columns=outcome_cols+['ID'], inplace=True)
    test_data.drop(columns= outcome_cols+['ID'], inplace=True)
    
    #print(f'Fold {i} Imputation')
    #imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    #train_data.values = imp.fit_transform(train_data)
    #test_data.values  = imp.transform(test_data)
    test_data = test_data.fillna(train_data.mean())
    train_data = train_data.fillna(train_data.mean())
    
    sd_0_cols = train_data.columns[(train_data.std() == 0)]
    train_data.drop(columns=sd_0_cols, inplace=True)
    test_data.drop(columns=sd_0_cols, inplace=True)
    
    cols_to_scale = [foo for foo in con_cat_cols + contin_cols if foo in train_data.columns]
    scaler = StandardScaler()
    train_data.loc[:,cols_to_scale] = scaler.fit_transform(train_data.loc[:,cols_to_scale])
    test_data.loc[:,cols_to_scale]  = scaler.transform(test_data.loc[:,cols_to_scale])
    
    xgb = XGBClassifier()
    xgb.fit(train_data, train_labels)
    auc = roc_auc_score(test_labels, xgb.predict_proba(test_data)[:,1])
    print(f'XGB AUC={auc:.3f}\tNNs:')
    if i == 0:
        auc_df['xgb'] = np.nan
    auc_df['xgb'].iloc[i] = auc
    
    rf = RandomForestClassifier()
    rf.fit(train_data, train_labels)
    auc = roc_auc_score(test_labels, rf.predict_proba(test_data)[:,1])
    print(f'RF AUC={auc:.3f}\nNNs:')
    if i == 0:
        auc_df['rf'] = np.nan
    auc_df['rf'].iloc[i] = auc
    
    for hl_count in [1,2,3,4]:
        print(f'\t{hl_count} HL(s): ', end='')
        for hl_size in np.linspace(6,60,dtype=int, num=6):
            model = make_model(hl_count=hl_count, hl_size=hl_size)
            name = f'{hl_count}_{hl_size}'
            if name not in auc_df.columns:
                auc_df[name] = np.nan
            
            baseline_history = model.fit(
                train_data.values,
                train_labels.values,
                batch_size=BATCH_SIZE,
                epochs=EPOCHS,
                callbacks = [early_stopping],
                validation_data=(test_data.values, test_labels.values),
                verbose=0
            )
            auc = np.mean(baseline_history.history['val_auc'][-100:])
            run_length = len(baseline_history.history['val_auc'])
            auc_df[name].iloc[i] = auc
            print(f'size{hl_size}=> AUC={auc:.3f} ({str(run_length).rjust(4)} epochs)   ', end='')
        print()
            
        
    #pos = train_labels.sum()
    #total = len(train_labels)
    #neg = total - pos

    

Fold 0: 05:25:19 AM  



XGB AUC=0.918	NNs:
RF AUC=0.925
NNs:
	1 HL(s): size6=> AUC=0.761 ( 845 epochs)   size16=> AUC=0.805 ( 456 epochs)   size27=> AUC=0.844 (1173 epochs)   size38=> AUC=0.846 ( 917 epochs)   size49=> AUC=0.863 (1201 epochs)   size60=> AUC=0.875 (1186 epochs)   
	2 HL(s): size6=> AUC=0.699 ( 500 epochs)   size16=> AUC=0.816 ( 657 epochs)   size27=> AUC=0.821 ( 441 epochs)   size38=> AUC=0.836 ( 724 epochs)   size49=> AUC=0.888 (1506 epochs)   size60=> AUC=0.868 ( 515 epochs)   
	3 HL(s): size6=> AUC=0.498 ( 213 epochs)   size16=> AUC=0.772 ( 797 epochs)   size27=> AUC=0.809 ( 437 epochs)   size38=> AUC=0.841 (1043 epochs)   size49=> AUC=0.818 ( 657 epochs)   size60=> AUC=0.855 ( 657 epochs)   
	4 HL(s): size6=> AUC=0.500 ( 208 epochs)   size16=> AUC=0.731 ( 344 epochs)   size27=> AUC=0.827 ( 738 epochs)   size38=> AUC=0.785 ( 993 epochs)   size49=> AUC=0.803 ( 708 epochs)   size60=> AUC=0.858 (1046 epochs)   
Fold 1: 07:10:58 AM  



XGB AUC=0.910	NNs:
RF AUC=0.918
NNs:
	1 HL(s): size6=> AUC=0.765 ( 787 epochs)   size16=> AUC=0.818 ( 588 epochs)   size27=> AUC=0.849 (1021 epochs)   size38=> AUC=0.851 ( 734 epochs)   size49=> AUC=0.842 ( 641 epochs)   size60=> AUC=0.863 ( 830 epochs)   
	2 HL(s): size6=> AUC=0.727 ( 848 epochs)   size16=> AUC=0.789 ( 807 epochs)   size27=> AUC=0.835 (1387 epochs)   size38=> AUC=0.846 ( 450 epochs)   size49=> AUC=0.839 ( 768 epochs)   size60=> AUC=0.849 (1265 epochs)   
	3 HL(s): size6=> AUC=0.727 ( 649 epochs)   size16=> AUC=0.778 ( 468 epochs)   size27=> AUC=0.788 ( 510 epochs)   size38=> AUC=0.843 ( 669 epochs)   size49=> AUC=0.839 ( 632 epochs)   size60=> AUC=0.841 (1177 epochs)   
	4 HL(s): size6=> AUC=0.501 ( 204 epochs)   size16=> AUC=0.752 ( 756 epochs)   size27=> AUC=0.801 ( 849 epochs)   size38=> AUC=0.835 ( 595 epochs)   size49=> AUC=0.721 ( 898 epochs)   size60=> AUC=0.842 ( 815 epochs)   
Fold 2: 09:11:17 AM  



XGB AUC=0.858	NNs:
RF AUC=0.889
NNs:
	1 HL(s): size6=> AUC=0.796 ( 670 epochs)   size16=> AUC=0.804 ( 756 epochs)   size27=> AUC=0.837 ( 979 epochs)   size38=> AUC=0.835 ( 479 epochs)   size49=> AUC=0.830 ( 641 epochs)   size60=> AUC=0.848 ( 604 epochs)   
	2 HL(s): size6=> AUC=0.759 ( 782 epochs)   size16=> AUC=0.806 ( 678 epochs)   size27=> AUC=0.783 ( 426 epochs)   size38=> AUC=0.855 ( 872 epochs)   size49=> AUC=0.854 ( 967 epochs)   size60=> AUC=0.833 ( 828 epochs)   
	3 HL(s): size6=> AUC=0.731 ( 942 epochs)   size16=> AUC=0.781 (1014 epochs)   size27=> AUC=0.755 ( 507 epochs)   size38=> AUC=0.803 ( 492 epochs)   size49=> AUC=0.820 ( 709 epochs)   size60=> AUC=0.839 ( 636 epochs)   
	4 HL(s): size6=> AUC=0.507 ( 205 epochs)   size16=> AUC=0.783 ( 472 epochs)   size27=> AUC=0.771 ( 508 epochs)   size38=> AUC=0.797 ( 477 epochs)   size49=> AUC=0.805 ( 509 epochs)   size60=> AUC=0.807 ( 877 epochs)   
Fold 3: 11:14:48 AM  



XGB AUC=0.903	NNs:
RF AUC=0.908
NNs:
	1 HL(s): size6=> AUC=0.747 ( 541 epochs)   size16=> AUC=0.779 ( 366 epochs)   size27=> AUC=0.829 ( 727 epochs)   size38=> AUC=0.843 ( 808 epochs)   size49=> AUC=0.891 (1179 epochs)   size60=> AUC=0.884 ( 913 epochs)   
	2 HL(s): size6=> AUC=0.757 ( 912 epochs)   size16=> AUC=0.814 ( 870 epochs)   size27=> AUC=0.847 ( 777 epochs)   size38=> AUC=0.843 ( 730 epochs)   size49=> AUC=0.876 (1005 epochs)   size60=> AUC=0.871 ( 694 epochs)   
	3 HL(s): size6=> AUC=0.727 ( 927 epochs)   size16=> AUC=0.788 ( 845 epochs)   size27=> AUC=0.835 ( 698 epochs)   size38=> AUC=0.847 ( 556 epochs)   size49=> AUC=0.838 ( 839 epochs)   size60=> AUC=0.843 (1122 epochs)   
	4 HL(s): size6=> AUC=0.503 ( 204 epochs)   size16=> AUC=0.728 ( 823 epochs)   size27=> AUC=0.812 (1128 epochs)   size38=> AUC=0.724 (1039 epochs)   size49=> AUC=0.822 ( 649 epochs)   size60=> AUC=0.837 (1003 epochs)   
Fold 4: 01:54:55 PM  



XGB AUC=0.907	NNs:
RF AUC=0.926
NNs:
	1 HL(s): size6=> AUC=0.779 ( 644 epochs)   size16=> AUC=0.847 ( 730 epochs)   size27=> AUC=0.855 (1663 epochs)   size38=> AUC=0.864 (1091 epochs)   size49=> AUC=0.883 (1298 epochs)   size60=> AUC=0.875 (1456 epochs)   
	2 HL(s): size6=> AUC=0.739 ( 417 epochs)   size16=> AUC=0.788 ( 496 epochs)   size27=> AUC=0.808 ( 499 epochs)   size38=> AUC=0.848 ( 626 epochs)   size49=> AUC=0.831 ( 691 epochs)   size60=> AUC=0.876 ( 770 epochs)   
	3 HL(s): size6=> AUC=0.500 ( 216 epochs)   size16=> AUC=0.790 ( 835 epochs)   size27=> AUC=0.797 ( 567 epochs)   size38=> AUC=0.835 ( 983 epochs)   size49=> AUC=0.835 ( 810 epochs)   size60=> AUC=0.852 (1128 epochs)   
	4 HL(s): size6=> AUC=0.651 ( 520 epochs)   size16=> AUC=0.709 ( 564 epochs)   size27=> AUC=0.806 ( 815 epochs)   size38=> AUC=0.826 ( 985 epochs)   size49=> AUC=0.828 ( 430 epochs)   size60=> AUC=0.820 ( 691 epochs)   


## SMOTE on each K-fold group training data

In [21]:
auc_df = pd.DataFrame(index=range(5))

for i, (train, test) in enumerate(skf.split(df, labels)):
    print(f'Fold {i}: {datetime.datetime.now().strftime("%I:%M:%S %p")}  ', end='')
    train_data = df.iloc[train].copy()
    test_data = df.iloc[test].copy()
    
    train_labels=labels.iloc[train].copy()
    test_labels=labels.iloc[test].copy()
    
    weights = len(train_labels)/test_labels.sum()
    glm_weights = pd.Series(data=1, index=train_labels.index)
    glm_weights.loc[train_labels==1] = weights
    
    #remove outcomes
    train_id = train_data['ID'].copy()
    test_id = test_data['ID'].copy()
    
    train_data.drop(columns=outcome_cols+['ID'], inplace=True)
    test_data.drop(columns= outcome_cols+['ID'], inplace=True)
    
    #print(f'Fold {i} Imputation')
    #imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    #train_data.values = imp.fit_transform(train_data)
    #test_data.values  = imp.transform(test_data)
    test_data = test_data.fillna(train_data.mean())
    train_data = train_data.fillna(train_data.mean())
    
    #---------------------------------------------------------------------------
    train_data, train_labels = oversample.fit_resample(train_data, train_labels)
    #---------------------------------------------------------------------------
    
    sd_0_cols = train_data.columns[(train_data.std() == 0)]
    train_data.drop(columns=sd_0_cols, inplace=True)
    test_data.drop(columns=sd_0_cols, inplace=True)
    
    cols_to_scale = [foo for foo in con_cat_cols + contin_cols if foo in train_data.columns]
    scaler = StandardScaler()
    train_data.loc[:,cols_to_scale] = scaler.fit_transform(train_data.loc[:,cols_to_scale])
    test_data.loc[:,cols_to_scale]  = scaler.transform(test_data.loc[:,cols_to_scale])
    
    xgb = XGBClassifier()
    xgb.fit(train_data, train_labels)
    auc = roc_auc_score(test_labels, xgb.predict_proba(test_data)[:,1])
    print(f'XGB AUC={auc:.3f}\tNNs:')
    if i == 0:
        auc_df['xgb'] = np.nan
    auc_df['xgb'].iloc[i] = auc
    
    rf = RandomForestClassifier()
    rf.fit(train_data, train_labels)
    auc = roc_auc_score(test_labels, rf.predict_proba(test_data)[:,1])
    print(f'RF AUC={auc:.3f}\nNNs:')
    if i == 0:
        auc_df['rf'] = np.nan
    auc_df['rf'].iloc[i] = auc
    
    for hl_count in [1,2,3,4]:
        print(f'\t{hl_count} HL(s): ', end='')
        for hl_size in np.linspace(6,60,dtype=int, num=6):
            model = make_model(hl_count=hl_count, hl_size=hl_size)
            name = f'{hl_count}_{hl_size}'
            if name not in auc_df.columns:
                auc_df[name] = np.nan
            
            baseline_history = model.fit(
                train_data.values,
                train_labels.values,
                batch_size=BATCH_SIZE,
                epochs=EPOCHS,
                callbacks = [early_stopping],
                validation_data=(test_data.values, test_labels.values),
                verbose=0
            )
            auc = np.mean(baseline_history.history['val_auc'][-100:])
            run_length = len(baseline_history.history['val_auc'])
            auc_df[name].iloc[i] = auc
            print(f'size{hl_size}=> AUC={auc:.3f} ({str(run_length).rjust(4)} epochs)   ', end='')
        print()
            
        
    #pos = train_labels.sum()
    #total = len(train_labels)
    #neg = total - pos

    

Fold 0: 04:35:32 PM  



XGB AUC=0.919	NNs:
RF AUC=0.927
NNs:
	1 HL(s): size6=> AUC=0.787 ( 655 epochs)   size16=> AUC=0.769 ( 367 epochs)   size27=> AUC=0.864 ( 922 epochs)   size38=> AUC=0.828 ( 654 epochs)   size49=> AUC=0.855 ( 807 epochs)   size60=> AUC=0.871 ( 876 epochs)   
	2 HL(s): size6=> AUC=0.688 ( 367 epochs)   size16=> AUC=0.798 (1124 epochs)   size27=> AUC=0.810 ( 904 epochs)   size38=> AUC=0.841 ( 588 epochs)   size49=> AUC=0.844 ( 632 epochs)   size60=> AUC=0.871 ( 522 epochs)   
	3 HL(s): size6=> AUC=0.772 ( 720 epochs)   size16=> AUC=0.760 ( 444 epochs)   size27=> AUC=0.847 ( 934 epochs)   size38=> AUC=0.838 ( 540 epochs)   size49=> AUC=0.845 ( 520 epochs)   size60=> AUC=0.866 ( 603 epochs)   
	4 HL(s): size6=> AUC=0.500 ( 257 epochs)   size16=> AUC=0.793 ( 941 epochs)   size27=> AUC=0.753 ( 477 epochs)   size38=> AUC=0.793 ( 527 epochs)   size49=> AUC=0.760 ( 588 epochs)   size60=> AUC=0.838 ( 878 epochs)   
Fold 1: 06:52:10 PM  



XGB AUC=0.910	NNs:
RF AUC=0.925
NNs:
	1 HL(s): size6=> AUC=0.790 (1063 epochs)   size16=> AUC=0.816 ( 676 epochs)   size27=> AUC=0.824 ( 781 epochs)   size38=> AUC=0.842 ( 908 epochs)   size49=> AUC=0.863 ( 772 epochs)   size60=> AUC=0.837 (1127 epochs)   
	2 HL(s): size6=> AUC=0.760 ( 682 epochs)   size16=> AUC=0.785 ( 873 epochs)   size27=> AUC=0.824 (1035 epochs)   size38=> AUC=0.826 ( 904 epochs)   size49=> AUC=0.834 ( 905 epochs)   size60=> AUC=0.877 (1110 epochs)   
	3 HL(s): size6=> AUC=0.503 ( 201 epochs)   size16=> AUC=0.745 (1433 epochs)   size27=> AUC=0.808 ( 722 epochs)   size38=> AUC=0.825 ( 986 epochs)   size49=> AUC=0.817 ( 665 epochs)   size60=> AUC=0.814 (1076 epochs)   
	4 HL(s): size6=> AUC=0.502 ( 204 epochs)   size16=> AUC=0.690 ( 419 epochs)   size27=> AUC=0.790 ( 422 epochs)   size38=> AUC=0.826 ( 510 epochs)   size49=> AUC=0.843 ( 802 epochs)   size60=> AUC=0.828 ( 455 epochs)   
Fold 2: 09:37:56 PM  



XGB AUC=0.858	NNs:
RF AUC=0.895
NNs:
	1 HL(s): size6=> AUC=0.746 ( 811 epochs)   size16=> AUC=0.797 ( 458 epochs)   size27=> AUC=0.850 (1071 epochs)   size38=> AUC=0.866 ( 790 epochs)   size49=> AUC=0.837 ( 781 epochs)   size60=> AUC=0.857 ( 968 epochs)   
	2 HL(s): size6=> AUC=0.718 ( 511 epochs)   size16=> AUC=0.803 ( 612 epochs)   size27=> AUC=0.770 ( 572 epochs)   size38=> AUC=0.826 ( 660 epochs)   size49=> AUC=0.831 ( 515 epochs)   size60=> AUC=0.821 ( 839 epochs)   
	3 HL(s): size6=> AUC=0.687 ( 627 epochs)   size16=> AUC=0.805 (1382 epochs)   size27=> AUC=0.789 ( 924 epochs)   size38=> AUC=0.815 (1392 epochs)   size49=> AUC=0.847 ( 947 epochs)   size60=> AUC=0.830 ( 934 epochs)   
	4 HL(s): size6=> AUC=0.499 ( 206 epochs)   size16=> AUC=0.808 (1578 epochs)   size27=> AUC=0.814 (1155 epochs)   size38=> AUC=0.801 ( 905 epochs)   size49=> AUC=0.804 ( 824 epochs)   size60=> AUC=0.801 ( 654 epochs)   
Fold 3: 01:04:45 AM  



XGB AUC=0.903	NNs:
RF AUC=0.900
NNs:
	1 HL(s): size6=> AUC=0.758 ( 761 epochs)   size16=> AUC=0.864 ( 848 epochs)   size27=> AUC=0.851 (1120 epochs)   size38=> AUC=0.869 ( 613 epochs)   size49=> AUC=0.870 ( 823 epochs)   size60=> AUC=0.864 (1070 epochs)   
	2 HL(s): size6=> AUC=0.663 ( 347 epochs)   size16=> AUC=0.806 (1225 epochs)   size27=> AUC=0.852 ( 610 epochs)   size38=> AUC=0.848 ( 564 epochs)   size49=> AUC=0.861 ( 781 epochs)   size60=> AUC=0.862 ( 753 epochs)   
	3 HL(s): size6=> AUC=0.502 ( 208 epochs)   size16=> AUC=0.785 ( 708 epochs)   size27=> AUC=0.824 ( 732 epochs)   size38=> AUC=0.860 (1113 epochs)   size49=> AUC=0.868 (1144 epochs)   size60=> AUC=0.833 ( 537 epochs)   
	4 HL(s): size6=> AUC=0.500 ( 201 epochs)   size16=> AUC=0.773 ( 701 epochs)   size27=> AUC=0.797 ( 642 epochs)   size38=> AUC=0.805 ( 474 epochs)   size49=> AUC=0.849 ( 510 epochs)   size60=> AUC=0.794 (1039 epochs)   
Fold 4: 04:15:07 AM  



XGB AUC=0.907	NNs:
RF AUC=0.941
NNs:
	1 HL(s): size6=> AUC=0.771 ( 437 epochs)   size16=> AUC=0.821 (1002 epochs)   size27=> AUC=0.832 ( 711 epochs)   size38=> AUC=0.875 ( 803 epochs)   size49=> AUC=0.854 ( 730 epochs)   size60=> AUC=0.862 ( 984 epochs)   
	2 HL(s): size6=> AUC=0.745 ( 390 epochs)   size16=> AUC=0.803 ( 707 epochs)   size27=> AUC=0.821 ( 612 epochs)   size38=> AUC=0.844 ( 764 epochs)   size49=> AUC=0.875 (1065 epochs)   size60=> AUC=0.863 ( 759 epochs)   
	3 HL(s): size6=> AUC=0.494 ( 208 epochs)   size16=> AUC=0.731 ( 355 epochs)   size27=> AUC=0.817 ( 825 epochs)   size38=> AUC=0.792 ( 490 epochs)   size49=> AUC=0.858 ( 974 epochs)   size60=> AUC=0.828 ( 495 epochs)   
	4 HL(s): size6=> AUC=0.500 ( 201 epochs)   size16=> AUC=0.756 ( 580 epochs)   size27=> AUC=0.797 ( 772 epochs)   size38=> AUC=0.783 ( 456 epochs)   size49=> AUC=0.846 (1047 epochs)   size60=> AUC=0.823 ( 891 epochs)   


## SMOTE and Random Undersampling on each  K-fold Group Training data

In [14]:
auc_df = pd.DataFrame(index=range(5))

for i, (train, test) in enumerate(skf.split(df, labels)):
    print(f'Fold {i}: {datetime.datetime.now().strftime("%I:%M:%S %p")}  ', end='')
    train_data = df.iloc[train].copy()
    test_data = df.iloc[test].copy()
    
    train_labels=labels.iloc[train].copy()
    test_labels=labels.iloc[test].copy()
    
    weights = len(train_labels)/test_labels.sum()
    glm_weights = pd.Series(data=1, index=train_labels.index)
    glm_weights.loc[train_labels==1] = weights
    
    #remove outcomes
    train_id = train_data['ID'].copy()
    test_id = test_data['ID'].copy()
    
    train_data.drop(columns=outcome_cols+['ID'], inplace=True)
    test_data.drop(columns= outcome_cols+['ID'], inplace=True)
    
    #print(f'Fold {i} Imputation')
    #imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    #train_data.values = imp.fit_transform(train_data)
    #test_data.values  = imp.transform(test_data)
    test_data = test_data.fillna(train_data.mean())
    train_data = train_data.fillna(train_data.mean())
    
    #---------------------------------------------------------------------------
    train_data, train_labels = oversample.fit_resample(train_data, train_labels)
    train_data, train_labels = ran_undersample.fit_resample(train_data, train_labels)
    #---------------------------------------------------------------------------
    
    sd_0_cols = train_data.columns[(train_data.std() == 0)]
    train_data.drop(columns=sd_0_cols, inplace=True)
    test_data.drop(columns=sd_0_cols, inplace=True)
    
    cols_to_scale = [foo for foo in con_cat_cols + contin_cols if foo in train_data.columns]
    scaler = StandardScaler()
    train_data.loc[:,cols_to_scale] = scaler.fit_transform(train_data.loc[:,cols_to_scale])
    test_data.loc[:,cols_to_scale]  = scaler.transform(test_data.loc[:,cols_to_scale])
    
    xgb = XGBClassifier()
    xgb.fit(train_data, train_labels)
    auc = roc_auc_score(test_labels, xgb.predict_proba(test_data)[:,1])
    print(f'XGB AUC={auc:.3f}\tNNs:')
    if i == 0:
        auc_df['xgb'] = np.nan
    auc_df['xgb'].iloc[i] = auc
    
    rf = RandomForestClassifier()
    rf.fit(train_data, train_labels)
    auc = roc_auc_score(test_labels, rf.predict_proba(test_data)[:,1])
    print(f'RF AUC={auc:.3f}\nNNs:')
    if i == 0:
        auc_df['rf'] = np.nan
    auc_df['rf'].iloc[i] = auc
    
    for hl_count in [1,2,3,4]:
        print(f'\t{hl_count} HL(s): ', end='')
        for hl_size in np.linspace(6,60,dtype=int, num=6):
            model = make_model(hl_count=hl_count, hl_size=hl_size)
            name = f'{hl_count}_{hl_size}'
            if name not in auc_df.columns:
                auc_df[name] = np.nan
            
            baseline_history = model.fit(
                train_data.values,
                train_labels.values,
                batch_size=BATCH_SIZE,
                epochs=EPOCHS,
                callbacks = [early_stopping],
                validation_data=(test_data.values, test_labels.values),
                verbose=0
            )
            auc = np.mean(baseline_history.history['val_auc'][-100:])
            run_length = len(baseline_history.history['val_auc'])
            auc_df[name].iloc[i] = auc
            print(f'size{hl_size}=> AUC={auc:.3f} ({str(run_length).rjust(4)} epochs)   ', end='')
        print()
            
        
    #pos = train_labels.sum()
    #total = len(train_labels)
    #neg = total - pos

    

Fold 0: 04:47:59 PM  



XGB AUC=0.692	NNs:
RF AUC=0.710
NNs:
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
size6=> AUC=0.680 ( 784 epochs)   size16=> AUC=0.594 ( 289 epochs)   size27=> AUC=0.643 ( 273 epochs)   size38=> AUC=0.627 ( 253 epochs)   size49=> AUC=0.617 ( 281 epochs)   size60=> AUC=0.607 ( 225 epochs)   
	2 HL(s): size6=> AUC=0.579 ( 206 epochs)   size16=> AUC=0.652 ( 404 epochs)   size27=> AUC=0.642 ( 293 epochs)   size38=> AUC=0.636 ( 256 epochs)   size49=> AUC=0.597 ( 267 epochs)   size60=> AUC=0.560 ( 262 epochs)   
	3 HL(s): size6=> AUC=0.637 ( 499 epochs)   size16=> AUC=0.602 ( 380 epochs)   size27=> AUC=0.562 ( 446 epochs)   size38=> AUC=0.592 ( 293 epochs)   size49=> AUC=0.627 ( 298 epochs)   size60=> AUC=0.654 ( 323 epochs)   
	4 HL(s): size6=> AUC=0.608 ( 512 epochs)   size16=> AUC=0.646 ( 571 epochs)   size27=> AUC=0.602 ( 3



XGB AUC=0.740	NNs:
RF AUC=0.766
NNs:
	1 HL(s): size6=> AUC=0.689 ( 262 epochs)   size16=> AUC=0.688 ( 247 epochs)   size27=> AUC=0.695 ( 293 epochs)   size38=> AUC=0.708 ( 788 epochs)   size49=> AUC=0.675 ( 209 epochs)   size60=> AUC=0.699 ( 244 epochs)   
	2 HL(s): size6=> AUC=0.633 ( 349 epochs)   size16=> AUC=0.643 ( 468 epochs)   size27=> AUC=0.683 ( 298 epochs)   size38=> AUC=0.654 ( 268 epochs)   size49=> AUC=0.694 ( 230 epochs)   size60=> AUC=0.662 ( 274 epochs)   
	3 HL(s): size6=> AUC=0.741 ( 429 epochs)   size16=> AUC=0.639 ( 308 epochs)   size27=> AUC=0.669 ( 313 epochs)   size38=> AUC=0.625 ( 265 epochs)   size49=> AUC=0.699 ( 282 epochs)   size60=> AUC=0.615 ( 297 epochs)   
	4 HL(s): size6=> AUC=0.514 ( 207 epochs)   size16=> AUC=0.626 ( 347 epochs)   size27=> AUC=0.595 ( 288 epochs)   size38=> AUC=0.692 ( 361 epochs)   size49=> AUC=0.669 ( 327 epochs)   size60=> AUC=0.643 ( 301 epochs)   




XGB AUC=0.724	NNs:
RF AUC=0.721
NNs:
	1 HL(s): size6=> AUC=0.667 ( 407 epochs)   size16=> AUC=0.673 ( 748 epochs)   size27=> AUC=0.684 ( 690 epochs)   size38=> AUC=0.713 ( 302 epochs)   size49=> AUC=0.684 ( 311 epochs)   size60=> AUC=0.689 ( 240 epochs)   
	2 HL(s): size6=> AUC=0.623 ( 345 epochs)   size16=> AUC=0.630 ( 334 epochs)   size27=> AUC=0.658 ( 303 epochs)   size38=> AUC=0.666 ( 291 epochs)   size49=> AUC=0.679 ( 342 epochs)   size60=> AUC=0.669 ( 271 epochs)   
	3 HL(s): size6=> AUC=0.614 ( 421 epochs)   size16=> AUC=0.678 ( 407 epochs)   size27=> AUC=0.623 ( 301 epochs)   size38=> AUC=0.644 ( 290 epochs)   size49=> AUC=0.628 ( 273 epochs)   size60=> AUC=0.636 ( 301 epochs)   
	4 HL(s): size6=> AUC=0.682 ( 756 epochs)   size16=> AUC=0.630 ( 379 epochs)   size27=> AUC=0.644 ( 330 epochs)   size38=> AUC=0.600 ( 304 epochs)   size49=> AUC=0.626 ( 321 epochs)   size60=> AUC=0.628 ( 315 epochs)   




XGB AUC=0.694	NNs:
RF AUC=0.661
NNs:
	1 HL(s): size6=> AUC=0.615 ( 347 epochs)   size16=> AUC=0.610 ( 391 epochs)   size27=> AUC=0.617 ( 284 epochs)   size38=> AUC=0.617 ( 208 epochs)   size49=> AUC=0.625 ( 252 epochs)   size60=> AUC=0.649 ( 225 epochs)   
	2 HL(s): size6=> AUC=0.521 ( 379 epochs)   size16=> AUC=0.654 ( 276 epochs)   size27=> AUC=0.618 ( 298 epochs)   size38=> AUC=0.584 ( 280 epochs)   size49=> AUC=0.620 ( 286 epochs)   size60=> AUC=0.606 ( 258 epochs)   
	3 HL(s): size6=> AUC=0.612 ( 414 epochs)   size16=> AUC=0.616 ( 393 epochs)   size27=> AUC=0.657 ( 310 epochs)   size38=> AUC=0.575 ( 297 epochs)   size49=> AUC=0.610 ( 280 epochs)   size60=> AUC=0.627 ( 245 epochs)   
	4 HL(s): size6=> AUC=0.642 ( 541 epochs)   size16=> AUC=0.606 ( 468 epochs)   size27=> AUC=0.621 ( 336 epochs)   size38=> AUC=0.558 ( 333 epochs)   size49=> AUC=0.595 ( 300 epochs)   size60=> AUC=0.569 ( 279 epochs)   
Fold 4: 06:03:59 PM  



XGB AUC=0.677	NNs:
RF AUC=0.638
NNs:
	1 HL(s): size6=> AUC=0.666 (1203 epochs)   size16=> AUC=0.689 ( 255 epochs)   size27=> AUC=0.661 ( 273 epochs)   size38=> AUC=0.664 ( 247 epochs)   size49=> AUC=0.660 ( 373 epochs)   size60=> AUC=0.672 ( 205 epochs)   
	2 HL(s): size6=> AUC=0.497 ( 201 epochs)   size16=> AUC=0.684 ( 497 epochs)   size27=> AUC=0.706 ( 606 epochs)   size38=> AUC=0.642 ( 226 epochs)   size49=> AUC=0.653 ( 354 epochs)   size60=> AUC=0.686 ( 755 epochs)   
	3 HL(s): size6=> AUC=0.523 ( 231 epochs)   size16=> AUC=0.610 ( 314 epochs)   size27=> AUC=0.638 ( 257 epochs)   size38=> AUC=0.669 ( 299 epochs)   size49=> AUC=0.653 ( 284 epochs)   size60=> AUC=0.651 ( 249 epochs)   
	4 HL(s): size6=> AUC=0.609 ( 465 epochs)   size16=> AUC=0.610 ( 465 epochs)   size27=> AUC=0.627 ( 287 epochs)   size38=> AUC=0.621 ( 320 epochs)   size49=> AUC=0.614 ( 296 epochs)   size60=> AUC=0.674 ( 373 epochs)   


## SMOTE on each K-Fold Group Training and Testing data

In [15]:
auc_df = pd.DataFrame(index=range(5))

for i, (train, test) in enumerate(skf.split(df, labels)):
    print(f'Fold {i}: {datetime.datetime.now().strftime("%I:%M:%S %p")}  ', end='')
    train_data = df.iloc[train].copy()
    test_data = df.iloc[test].copy()
    
    train_labels=labels.iloc[train].copy()
    test_labels=labels.iloc[test].copy()
    
    weights = len(train_labels)/test_labels.sum()
    glm_weights = pd.Series(data=1, index=train_labels.index)
    glm_weights.loc[train_labels==1] = weights
    
    #remove outcomes
    train_id = train_data['ID'].copy()
    test_id = test_data['ID'].copy()
    
    train_data.drop(columns=outcome_cols+['ID'], inplace=True)
    test_data.drop(columns= outcome_cols+['ID'], inplace=True)
    
    #print(f'Fold {i} Imputation')
    #imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    #train_data.values = imp.fit_transform(train_data)
    #test_data.values  = imp.transform(test_data)
    test_data = test_data.fillna(train_data.mean())
    train_data = train_data.fillna(train_data.mean())
    
    #---------------------------------------------------------------------------
    train_data, train_labels = oversample.fit_resample(train_data, train_labels)
    test_data, test_labels = oversample.fit_resample(test_data, test_labels)
    #---------------------------------------------------------------------------
    
    sd_0_cols = train_data.columns[(train_data.std() == 0)]
    train_data.drop(columns=sd_0_cols, inplace=True)
    test_data.drop(columns=sd_0_cols, inplace=True)
    
    cols_to_scale = [foo for foo in con_cat_cols + contin_cols if foo in train_data.columns]
    scaler = StandardScaler()
    train_data.loc[:,cols_to_scale] = scaler.fit_transform(train_data.loc[:,cols_to_scale])
    test_data.loc[:,cols_to_scale]  = scaler.transform(test_data.loc[:,cols_to_scale])
    
    xgb = XGBClassifier()
    xgb.fit(train_data, train_labels)
    auc = roc_auc_score(test_labels, xgb.predict_proba(test_data)[:,1])
    print(f'XGB AUC={auc:.3f}\tNNs:')
    if i == 0:
        auc_df['xgb'] = np.nan
    auc_df['xgb'].iloc[i] = auc
    
    rf = RandomForestClassifier()
    rf.fit(train_data, train_labels)
    auc = roc_auc_score(test_labels, rf.predict_proba(test_data)[:,1])
    print(f'RF AUC={auc:.3f}\nNNs:')
    if i == 0:
        auc_df['rf'] = np.nan
    auc_df['rf'].iloc[i] = auc
    
    for hl_count in [1,2,3,4]:
        print(f'\t{hl_count} HL(s): ', end='')
        for hl_size in np.linspace(6,60,dtype=int, num=6):
            model = make_model(hl_count=hl_count, hl_size=hl_size)
            name = f'{hl_count}_{hl_size}'
            if name not in auc_df.columns:
                auc_df[name] = np.nan
            
            baseline_history = model.fit(
                train_data.values,
                train_labels.values,
                batch_size=BATCH_SIZE,
                epochs=EPOCHS,
                callbacks = [early_stopping],
                validation_data=(test_data.values, test_labels.values),
                verbose=0
            )
            auc = np.mean(baseline_history.history['val_auc'][-100:])
            run_length = len(baseline_history.history['val_auc'])
            auc_df[name].iloc[i] = auc
            print(f'size{hl_size}=> AUC={auc:.3f} ({str(run_length).rjust(4)} epochs)   ', end='')
        print()
            
        
    #pos = train_labels.sum()
    #total = len(train_labels)
    #neg = total - pos

    

Fold 0: 06:35:41 PM  



XGB AUC=0.863	NNs:
RF AUC=0.873
NNs:
	1 HL(s): size6=> AUC=0.675 ( 634 epochs)   size16=> AUC=0.692 ( 498 epochs)   size27=> AUC=0.683 ( 234 epochs)   size38=> AUC=0.694 ( 241 epochs)   size49=> AUC=0.686 ( 245 epochs)   size60=> AUC=0.633 ( 253 epochs)   
	2 HL(s): size6=> AUC=0.500 ( 247 epochs)   size16=> AUC=0.679 ( 436 epochs)   size27=> AUC=0.658 ( 356 epochs)   size38=> AUC=0.680 ( 263 epochs)   size49=> AUC=0.690 ( 283 epochs)   size60=> AUC=0.658 ( 265 epochs)   
	3 HL(s): size6=> AUC=0.631 ( 630 epochs)   size16=> AUC=0.639 ( 450 epochs)   size27=> AUC=0.730 ( 549 epochs)   size38=> AUC=0.648 ( 278 epochs)   size49=> AUC=0.693 ( 323 epochs)   size60=> AUC=0.630 ( 315 epochs)   
	4 HL(s): size6=> AUC=0.509 ( 202 epochs)   size16=> AUC=0.673 ( 634 epochs)   size27=> AUC=0.654 ( 355 epochs)   size38=> AUC=0.643 ( 288 epochs)   size49=> AUC=0.693 ( 325 epochs)   size60=> AUC=0.597 ( 284 epochs)   
Fold 1: 07:11:22 PM  



XGB AUC=0.874	NNs:
RF AUC=0.891
NNs:
	1 HL(s): size6=> AUC=0.710 ( 274 epochs)   size16=> AUC=0.708 ( 253 epochs)   size27=> AUC=0.729 ( 425 epochs)   size38=> AUC=0.718 ( 404 epochs)   size49=> AUC=0.739 ( 206 epochs)   size60=> AUC=0.723 ( 390 epochs)   
	2 HL(s): size6=> AUC=0.701 ( 522 epochs)   size16=> AUC=0.689 ( 280 epochs)   size27=> AUC=0.713 ( 314 epochs)   size38=> AUC=0.709 ( 285 epochs)   size49=> AUC=0.722 ( 271 epochs)   size60=> AUC=0.718 ( 278 epochs)   
	3 HL(s): size6=> AUC=0.672 ( 425 epochs)   size16=> AUC=0.731 ( 361 epochs)   size27=> AUC=0.686 ( 333 epochs)   size38=> AUC=0.719 ( 257 epochs)   size49=> AUC=0.695 ( 494 epochs)   size60=> AUC=0.656 ( 252 epochs)   
	4 HL(s): size6=> AUC=0.501 ( 202 epochs)   size16=> AUC=0.689 ( 470 epochs)   size27=> AUC=0.705 ( 439 epochs)   size38=> AUC=0.699 ( 352 epochs)   size49=> AUC=0.691 ( 350 epochs)   size60=> AUC=0.687 ( 314 epochs)   
Fold 2: 07:50:36 PM  



XGB AUC=0.871	NNs:
RF AUC=0.885
NNs:
	1 HL(s): size6=> AUC=0.694 ( 449 epochs)   size16=> AUC=0.748 ( 567 epochs)   size27=> AUC=0.734 ( 286 epochs)   size38=> AUC=0.721 ( 266 epochs)   size49=> AUC=0.761 ( 306 epochs)   size60=> AUC=0.743 ( 475 epochs)   
	2 HL(s): size6=> AUC=0.721 ( 371 epochs)   size16=> AUC=0.712 ( 421 epochs)   size27=> AUC=0.729 ( 294 epochs)   size38=> AUC=0.711 ( 276 epochs)   size49=> AUC=0.734 ( 548 epochs)   size60=> AUC=0.734 ( 292 epochs)   
	3 HL(s): size6=> AUC=0.599 ( 861 epochs)   size16=> AUC=0.718 ( 361 epochs)   size27=> AUC=0.710 ( 304 epochs)   size38=> AUC=0.690 ( 304 epochs)   size49=> AUC=0.715 ( 331 epochs)   size60=> AUC=0.730 ( 297 epochs)   
	4 HL(s): size6=> AUC=0.696 ( 786 epochs)   size16=> AUC=0.499 ( 204 epochs)   size27=> AUC=0.719 ( 422 epochs)   size38=> AUC=0.645 ( 304 epochs)   size49=> AUC=0.702 ( 330 epochs)   size60=> AUC=0.736 ( 325 epochs)   
Fold 3: 08:37:20 PM  



XGB AUC=0.852	NNs:
RF AUC=0.859
NNs:
	1 HL(s): size6=> AUC=0.624 ( 337 epochs)   size16=> AUC=0.648 ( 295 epochs)   size27=> AUC=0.659 ( 393 epochs)   size38=> AUC=0.671 ( 526 epochs)   size49=> AUC=0.630 ( 343 epochs)   size60=> AUC=0.652 ( 266 epochs)   
	2 HL(s): size6=> AUC=0.650 ( 536 epochs)   size16=> AUC=0.626 ( 393 epochs)   size27=> AUC=0.660 ( 530 epochs)   size38=> AUC=0.630 ( 311 epochs)   size49=> AUC=0.628 ( 288 epochs)   size60=> AUC=0.632 ( 269 epochs)   
	3 HL(s): size6=> AUC=0.625 ( 364 epochs)   size16=> AUC=0.683 ( 423 epochs)   size27=> AUC=0.645 ( 391 epochs)   size38=> AUC=0.609 ( 314 epochs)   size49=> AUC=0.667 ( 373 epochs)   size60=> AUC=0.617 ( 426 epochs)   
	4 HL(s): size6=> AUC=0.498 ( 201 epochs)   size16=> AUC=0.634 ( 456 epochs)   size27=> AUC=0.643 ( 553 epochs)   size38=> AUC=0.631 ( 374 epochs)   size49=> AUC=0.660 ( 449 epochs)   size60=> AUC=0.629 ( 291 epochs)   
Fold 4: 09:30:04 PM  



XGB AUC=0.864	NNs:
RF AUC=0.849
NNs:
	1 HL(s): size6=> AUC=0.620 ( 289 epochs)   size16=> AUC=0.727 ( 430 epochs)   size27=> AUC=0.717 ( 322 epochs)   size38=> AUC=0.736 ( 511 epochs)   size49=> AUC=0.726 (1201 epochs)   size60=> AUC=0.696 ( 242 epochs)   
	2 HL(s): size6=> AUC=0.500 ( 202 epochs)   size16=> AUC=0.691 ( 299 epochs)   size27=> AUC=0.698 ( 251 epochs)   size38=> AUC=0.693 ( 273 epochs)   size49=> AUC=0.730 ( 364 epochs)   size60=> AUC=0.742 ( 731 epochs)   
	3 HL(s): size6=> AUC=0.634 ( 710 epochs)   size16=> AUC=0.706 ( 518 epochs)   size27=> AUC=0.687 ( 283 epochs)   size38=> AUC=0.732 ( 375 epochs)   size49=> AUC=0.689 ( 291 epochs)   size60=> AUC=0.745 ( 562 epochs)   
	4 HL(s): size6=> AUC=0.500 ( 239 epochs)   size16=> AUC=0.679 ( 326 epochs)   size27=> AUC=0.669 ( 369 epochs)   size38=> AUC=0.701 ( 297 epochs)   size49=> AUC=0.735 ( 651 epochs)   size60=> AUC=0.675 ( 269 epochs)   


## SMOTE and Random Under Sampling in combo on k-fold groups Training and Testing data

In [16]:
auc_df = pd.DataFrame(index=range(5))

for i, (train, test) in enumerate(skf.split(df, labels)):
    print(f'Fold {i}: {datetime.datetime.now().strftime("%I:%M:%S %p")}  ', end='')
    train_data = df.iloc[train].copy()
    test_data = df.iloc[test].copy()
    
    train_labels=labels.iloc[train].copy()
    test_labels=labels.iloc[test].copy()
    
    weights = len(train_labels)/test_labels.sum()
    glm_weights = pd.Series(data=1, index=train_labels.index)
    glm_weights.loc[train_labels==1] = weights
    
    #remove outcomes
    train_id = train_data['ID'].copy()
    test_id = test_data['ID'].copy()
    
    train_data.drop(columns=outcome_cols+['ID'], inplace=True)
    test_data.drop(columns= outcome_cols+['ID'], inplace=True)
    
    #print(f'Fold {i} Imputation')
    #imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    #train_data.values = imp.fit_transform(train_data)
    #test_data.values  = imp.transform(test_data)
    test_data = test_data.fillna(train_data.mean())
    train_data = train_data.fillna(train_data.mean())
    
   #---------------------------------------------------------------------------
    train_data, train_labels = oversample.fit_resample(train_data, train_labels)
    train_data, train_labels = ran_undersample.fit_resample(train_data, train_labels)
    test_data, test_labels = oversample.fit_resample(test_data, test_labels)
    test_data, test_labels = ran_undersample.fit_resample(test_data, test_labels)
    #---------------------------------------------------------------------------

    sd_0_cols = train_data.columns[(train_data.std() == 0)]
    train_data.drop(columns=sd_0_cols, inplace=True)
    test_data.drop(columns=sd_0_cols, inplace=True)
    
    cols_to_scale = [foo for foo in con_cat_cols + contin_cols if foo in train_data.columns]
    scaler = StandardScaler()
    train_data.loc[:,cols_to_scale] = scaler.fit_transform(train_data.loc[:,cols_to_scale])
    test_data.loc[:,cols_to_scale]  = scaler.transform(test_data.loc[:,cols_to_scale])
    
    xgb = XGBClassifier()
    xgb.fit(train_data, train_labels)
    auc = roc_auc_score(test_labels, xgb.predict_proba(test_data)[:,1])
    print(f'XGB AUC={auc:.3f}\tNNs:')
    if i == 0:
        auc_df['xgb'] = np.nan
    auc_df['xgb'].iloc[i] = auc
    
    rf = RandomForestClassifier()
    rf.fit(train_data, train_labels)
    auc = roc_auc_score(test_labels, rf.predict_proba(test_data)[:,1])
    print(f'RF AUC={auc:.3f}\nNNs:')
    if i == 0:
        auc_df['rf'] = np.nan
    auc_df['rf'].iloc[i] = auc
    
    for hl_count in [1,2,3,4]:
        print(f'\t{hl_count} HL(s): ', end='')
        for hl_size in np.linspace(6,60,dtype=int, num=6):
            model = make_model(hl_count=hl_count, hl_size=hl_size)
            name = f'{hl_count}_{hl_size}'
            if name not in auc_df.columns:
                auc_df[name] = np.nan
            
            baseline_history = model.fit(
                train_data.values,
                train_labels.values,
                batch_size=BATCH_SIZE,
                epochs=EPOCHS,
                callbacks = [early_stopping],
                validation_data=(test_data.values, test_labels.values),
                verbose=0
            )
            auc = np.mean(baseline_history.history['val_auc'][-100:])
            run_length = len(baseline_history.history['val_auc'])
            auc_df[name].iloc[i] = auc
            print(f'size{hl_size}=> AUC={auc:.3f} ({str(run_length).rjust(4)} epochs)   ', end='')
        print()
            
        
    #pos = train_labels.sum()
    #total = len(train_labels)
    #neg = total - pos

    

Fold 0: 10:30:21 PM  



XGB AUC=0.874	NNs:
RF AUC=0.880
NNs:
	1 HL(s): size6=> AUC=0.693 ( 346 epochs)   size16=> AUC=0.712 ( 539 epochs)   size27=> AUC=0.720 ( 281 epochs)   size38=> AUC=0.678 ( 259 epochs)   size49=> AUC=0.695 ( 271 epochs)   size60=> AUC=0.670 ( 213 epochs)   
	2 HL(s): size6=> AUC=0.654 ( 344 epochs)   size16=> AUC=0.697 ( 351 epochs)   size27=> AUC=0.702 ( 600 epochs)   size38=> AUC=0.701 ( 316 epochs)   size49=> AUC=0.674 ( 267 epochs)   size60=> AUC=0.696 ( 291 epochs)   
	3 HL(s): size6=> AUC=0.631 ( 428 epochs)   size16=> AUC=0.685 ( 484 epochs)   size27=> AUC=0.700 ( 480 epochs)   size38=> AUC=0.677 ( 696 epochs)   size49=> AUC=0.691 ( 329 epochs)   size60=> AUC=0.657 ( 294 epochs)   
	4 HL(s): size6=> AUC=0.665 ( 369 epochs)   size16=> AUC=0.725 (1014 epochs)   size27=> AUC=0.698 ( 628 epochs)   size38=> AUC=0.645 ( 367 epochs)   size49=> AUC=0.658 ( 313 epochs)   size60=> AUC=0.652 ( 294 epochs)   
Fold 1: 11:39:46 PM  



XGB AUC=0.882	NNs:
RF AUC=0.885
NNs:
	1 HL(s): size6=> AUC=0.712 ( 354 epochs)   size16=> AUC=0.725 ( 619 epochs)   size27=> AUC=0.704 ( 254 epochs)   size38=> AUC=0.714 ( 265 epochs)   size49=> AUC=0.694 ( 243 epochs)   size60=> AUC=0.698 ( 258 epochs)   
	2 HL(s): size6=> AUC=0.703 ( 387 epochs)   size16=> AUC=0.718 ( 291 epochs)   size27=> AUC=0.680 ( 317 epochs)   size38=> AUC=0.693 ( 310 epochs)   size49=> AUC=0.704 ( 302 epochs)   size60=> AUC=0.715 ( 354 epochs)   
	3 HL(s): size6=> AUC=0.686 ( 498 epochs)   size16=> AUC=0.665 ( 344 epochs)   size27=> AUC=0.679 ( 318 epochs)   size38=> AUC=0.681 ( 319 epochs)   size49=> AUC=0.674 ( 293 epochs)   size60=> AUC=0.694 ( 292 epochs)   
	4 HL(s): size6=> AUC=0.635 ( 357 epochs)   size16=> AUC=0.675 ( 453 epochs)   size27=> AUC=0.721 ( 412 epochs)   size38=> AUC=0.687 ( 329 epochs)   size49=> AUC=0.650 ( 308 epochs)   size60=> AUC=0.683 ( 301 epochs)   
Fold 2: 12:47:25 AM  



XGB AUC=0.887	NNs:
RF AUC=0.871
NNs:
	1 HL(s): size6=> AUC=0.721 ( 492 epochs)   size16=> AUC=0.747 ( 438 epochs)   size27=> AUC=0.739 ( 343 epochs)   size38=> AUC=0.734 ( 477 epochs)   size49=> AUC=0.730 ( 378 epochs)   size60=> AUC=0.714 ( 232 epochs)   
	2 HL(s): size6=> AUC=0.684 ( 444 epochs)   size16=> AUC=0.645 ( 325 epochs)   size27=> AUC=0.722 ( 298 epochs)   size38=> AUC=0.751 ( 333 epochs)   size49=> AUC=0.740 ( 283 epochs)   size60=> AUC=0.719 ( 359 epochs)   
	3 HL(s): size6=> AUC=0.683 ( 461 epochs)   size16=> AUC=0.710 ( 418 epochs)   size27=> AUC=0.704 ( 372 epochs)   size38=> AUC=0.698 ( 357 epochs)   size49=> AUC=0.737 ( 358 epochs)   size60=> AUC=0.672 ( 255 epochs)   
	4 HL(s): size6=> AUC=0.697 (1281 epochs)   size16=> AUC=0.681 ( 385 epochs)   size27=> AUC=0.652 ( 346 epochs)   size38=> AUC=0.716 ( 842 epochs)   size49=> AUC=0.727 ( 345 epochs)   size60=> AUC=0.730 ( 401 epochs)   
Fold 3: 02:06:33 AM  



XGB AUC=0.848	NNs:
RF AUC=0.864
NNs:
	1 HL(s): size6=> AUC=0.607 ( 277 epochs)   size16=> AUC=0.632 ( 264 epochs)   size27=> AUC=0.648 ( 312 epochs)   size38=> AUC=0.639 ( 308 epochs)   size49=> AUC=0.610 ( 314 epochs)   size60=> AUC=0.667 ( 245 epochs)   
	2 HL(s): size6=> AUC=0.659 ( 319 epochs)   size16=> AUC=0.619 ( 340 epochs)   size27=> AUC=0.649 ( 328 epochs)   size38=> AUC=0.639 ( 222 epochs)   size49=> AUC=0.635 ( 362 epochs)   size60=> AUC=0.604 ( 300 epochs)   
	3 HL(s): size6=> AUC=0.595 ( 533 epochs)   size16=> AUC=0.642 ( 483 epochs)   size27=> AUC=0.615 ( 353 epochs)   size38=> AUC=0.614 ( 339 epochs)   size49=> AUC=0.634 ( 309 epochs)   size60=> AUC=0.642 ( 524 epochs)   
	4 HL(s): size6=> AUC=0.500 ( 201 epochs)   size16=> AUC=0.610 ( 314 epochs)   size27=> AUC=0.633 ( 317 epochs)   size38=> AUC=0.632 ( 326 epochs)   size49=> AUC=0.658 ( 443 epochs)   size60=> AUC=0.630 ( 320 epochs)   
Fold 4: 03:21:28 AM  



XGB AUC=0.868	NNs:
RF AUC=0.853
NNs:
	1 HL(s): size6=> AUC=0.675 ( 296 epochs)   size16=> AUC=0.703 ( 327 epochs)   size27=> AUC=0.713 ( 343 epochs)   size38=> AUC=0.683 ( 309 epochs)   size49=> AUC=0.719 ( 777 epochs)   size60=> AUC=0.709 ( 469 epochs)   
	2 HL(s): size6=> AUC=0.510 ( 201 epochs)   size16=> AUC=0.695 ( 442 epochs)   size27=> AUC=0.689 ( 348 epochs)   size38=> AUC=0.712 ( 357 epochs)   size49=> AUC=0.705 ( 261 epochs)   size60=> AUC=0.725 ( 601 epochs)   
	3 HL(s): size6=> AUC=0.670 ( 342 epochs)   size16=> AUC=0.694 ( 513 epochs)   size27=> AUC=0.665 ( 383 epochs)   size38=> AUC=0.733 ( 287 epochs)   size49=> AUC=0.699 ( 284 epochs)   size60=> AUC=0.694 ( 353 epochs)   
	4 HL(s): size6=> AUC=0.546 ( 207 epochs)   size16=> AUC=0.651 ( 339 epochs)   size27=> AUC=0.669 ( 306 epochs)   size38=> AUC=0.664 ( 380 epochs)   size49=> AUC=0.681 ( 318 epochs)   size60=> AUC=0.696 ( 322 epochs)   
