In [28]:
import pandas as pd
import numpy as np
import pickle
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [29]:
input_path = 'H:\RediMinds\VCQI'
train = pd.read_csv(input_path+"\VCQI_clean_train_30_day_complications.csv")
test = pd.read_csv(input_path+"\VCQI_clean_test_30_day_complications.csv")

In [30]:
target = 'POSTOP_COMPLICATIONS'
x_train = train.drop(labels=target, axis = 'columns').copy()
y_train = train[target].copy()
x_test = test.drop(labels=target, axis = 'columns').copy()
y_test = test[target].copy() 

In [31]:
print('% pos labels train {:.2f}'.format(y_train.sum()/len(y_train)))
print('% pos labels test {:.2f}'.format(y_test.sum()/len(y_test)))

% pos labels train 0.21
% pos labels test 0.21


### One Hot Encoding Cataegorical Data

In [32]:
# ONE HOT CODE data for training

# Create dummy variables
with open (input_path+'\cat_col_30_day_complications', 'rb') as fp:
    cat_col = pickle.load(fp)


from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')

one_hot_encoded_array = encoder.fit_transform(x_train[cat_col]).toarray()
column_name = encoder.get_feature_names(cat_col)
x_train_OHE =  pd.DataFrame(one_hot_encoded_array, columns= column_name)
x_train = x_train.merge(x_train_OHE, how = 'left', left_index = True, right_index =True) # create dummy variables
x_train = x_train.drop(labels = cat_col, axis = 'columns') # drop original variables

In [33]:
# Create dummy variables
one_hot_encoded_array = encoder.transform(x_test[cat_col]).toarray()
column_name = encoder.get_feature_names(cat_col)
x_test_OHE =  pd.DataFrame(one_hot_encoded_array, columns= column_name)
x_test = x_test.merge(x_test_OHE, how = 'left', left_index = True, right_index =True) # create dummy variables
x_test = x_test.drop(labels = cat_col, axis = 'columns') # drop original variables

In [34]:
print("Number of records in trainset {}".format(len(x_train)))
print("Number records in testset {}".format(len(x_test)))
print('% pos labels train {:.2f}'.format(y_train.sum()/len(y_train)))
print('% pos labels test {:.2f}'.format(y_test.sum()/len(y_test)))

Number of records in trainset 984
Number records in testset 422
% pos labels train 0.21
% pos labels test 0.21


### Defining Pipeline

In [35]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [36]:
# Pipeline for logist Classifier
numeric_features = x_train.select_dtypes('float').columns.tolist()
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features)], remainder='passthrough')

### Neural Network Classifier

In [37]:
# compute weight to account for class imbalance
from sklearn.utils.class_weight import compute_class_weight
weights = compute_class_weight(class_weight='balanced', classes=y_train.unique(), y = y_train)
weights
class_weight = {0: weights[0] , 1: weights[1]}
print(class_weight)

{0: 2.3883495145631066, 1: 0.6323907455012854}


In [38]:
def nn_model(dropout_rate, neurons, learning_rate):
    from numpy.random import seed
    seed(123)
    tf.random.set_seed(123)
    tf.keras.backend.clear_session()
   
    # input layer
    input_layer = keras.layers.Input(shape=(x_train.shape[1],), name = "input_layer")
    x = keras.layers.Dense(neurons, name = 'Dense_1',activation='relu')(input_layer)
    x = keras.layers.Dropout(dropout_rate, name=  'Dropout_1', seed = 42)(x)
    x = keras.layers.Dense(neurons, name = 'Dense_2',activation='relu')(x)
    x = keras.layers.Dropout(dropout_rate, name=  'Dropout_2', seed = 42)(x)
    main_output = keras.layers.Dense(1, activation='sigmoid',name='main_output')(x)

    model = keras.Model(inputs= input_layer, outputs=main_output)

    # compiling the model
    model.compile(optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=[tf.keras.metrics.AUC(curve = 'ROC',name = 'AUC_ROC'),
                           tf.keras.metrics.AUC(curve = 'PR', name = 'AUC_PR')],
                  )

    # Keras callback. The patience parameter is the amount of epochs to check for improvement
    
    return model

In [39]:
np.random.seed(1)
tf.random.set_seed(1)
from keras.wrappers.scikit_learn import KerasClassifier
model = KerasClassifier(build_fn=nn_model, verbose=0)
# grid search epochs, batch size and optimizer
parameter_dist = {'classifier__dropout_rate':[0.1,0.2,0.3,0.4,0.5],
                  'classifier__epochs':[10,20,30],
                  'classifier__neurons':[128, 256],
                  'classifier__learning_rate': [0.01, 0.001, 0.0001],
                 }

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      #('pca',PCA()),
                      ('classifier', model)])
model = GridSearchCV(clf,parameter_dist,n_jobs= 1,scoring= 'average_precision', cv = 10, verbose=1)

In [40]:
#early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, mode='min',restore_best_weights=True)
model.fit(x_train,y_train, classifier__class_weight = class_weight)

Fitting 10 folds for each of 90 candidates, totalling 900 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 900 out of 900 | elapsed: 51.9min finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('scaler',
                                                                                          StandardScaler(copy=True,
                                                                                                         with_mean=True,
                

In [41]:
results_nn = pd.DataFrame(model.predict(x_test), columns=['pred_label'])
results_nn['pred_prob'] =  pd.DataFrame(model.predict_proba(x_test))[1]
results_nn['true_label'] = np.array(y_test)

In [42]:
results_nn

Unnamed: 0,pred_label,pred_prob,true_label
0,1,0.895386,1
1,0,0.038000,0
2,0,0.003884,0
3,1,0.926017,1
4,0,0.001948,0
...,...,...,...
417,1,0.762174,1
418,0,0.091294,0
419,0,0.005862,1
420,0,0.004995,0


In [43]:
# NeuralNetwork Score Raw Data
print("\n Model Balanced Accuracy: \n" + str(metrics.balanced_accuracy_score(results_nn['true_label'], results_nn['pred_label'])))
print("\n Confusion Matrix : \n"+str(metrics.confusion_matrix(results_nn['true_label'], results_nn['pred_label'])))
print("\n Classification Report: \n"+ str(metrics.classification_report(results_nn['true_label'], results_nn['pred_label'])))
print("\n AUC-ROC: \n"+ str(metrics.roc_auc_score(results_nn['true_label'], results_nn['pred_prob'])))


def calc_aucpr_data(result):
    y_ACTUAL = result['true_label']
    scores_prob = result['pred_prob']
    yhat = result['pred_label']
    precision, recall, thresholds = metrics.precision_recall_curve(y_ACTUAL, scores_prob, pos_label=1)
    prc_auc = metrics.auc(recall,precision)
    return prc_auc

print("\n PR-ROC: \n"+ str(calc_aucpr_data(results_nn)))


 Model Balanced Accuracy: 
0.6126969666295509

 Confusion Matrix : 
[[322  11]
 [ 66  23]]

 Classification Report: 
              precision    recall  f1-score   support

           0       0.83      0.97      0.89       333
           1       0.68      0.26      0.37        89

    accuracy                           0.82       422
   macro avg       0.75      0.61      0.63       422
weighted avg       0.80      0.82      0.78       422


 AUC-ROC: 
0.8337888450248001

 PR-ROC: 
0.5979064652188595


In [44]:
def bootstrapped_AUC(result):
    from sklearn.utils import resample
    #from tqdm import tqdm

    n_iter = 10000
    roc_auc = list()
    prc_auc = list()


    for i in range(n_iter):
        result_sample = resample(result, n_samples = len(result),random_state=i)
        
        #Calculating AUROC for each sample
        y_ACTUAL= result_sample['true_label']
        scores_prob = result_sample['pred_prob']
        fpr, tpr, thresholds = metrics.roc_curve(y_ACTUAL, scores_prob, pos_label=1)
        roc_auc.append(metrics.auc(fpr, tpr))

        #calculate AUPRC for each sample
        y_ACTUAL = result_sample['true_label']
        scores_prob = result_sample['pred_prob']
        yhat = result_sample['pred_label']
        precision, recall, thresholds = metrics.precision_recall_curve(y_ACTUAL, scores_prob, pos_label=1)
        prc_auc.append(metrics.auc(recall,precision))
    
    return roc_auc, prc_auc

In [45]:
roc_auc_nn, pr_auc_nn = bootstrapped_AUC(results_nn)

In [46]:
dict = {'roc_auc_nn': roc_auc_nn,
        'pr_auc_nn': pr_auc_nn,
       }
pd.DataFrame(dict).describe(percentiles=[0.025,0.975])

Unnamed: 0,roc_auc_nn,pr_auc_nn
count,10000.0,10000.0
mean,0.834086,0.598821
std,0.024385,0.052551
min,0.734825,0.351332
2.5%,0.785063,0.493888
50%,0.834934,0.600067
97.5%,0.879186,0.698902
max,0.910027,0.798171


### Exporting the Model and Pipeline

In [47]:
output_path = 'output/models/'

In [48]:
import os
if not os.path.exists(output_path):
    os.makedirs(output_path)

In [49]:
from joblib import dump
# Save the Keras model first:
model.best_estimator_.named_steps['classifier'].model.save(output_path+'nn_model.h5')


# Finally, save the pipeline:
dump(model.best_estimator_.named_steps['preprocessor'], output_path+'nn_pipeline.pkl')

['output/models/nn_pipeline.pkl']