In [69]:
import pandas as pd
import numpy as np
import pickle
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [70]:
input_path = 'H:\RediMinds\VCQI'
train = pd.read_csv(input_path+"\VCQI_clean_train.csv")
test = pd.read_csv(input_path+"\VCQI_clean_test.csv")

In [71]:
x_train = train.drop(labels=['INTRA_OP_COMPLICATIONS','PATIENTNUMBER'], axis = 'columns').copy()
y_train = train['INTRA_OP_COMPLICATIONS'].copy()
x_test = test.drop(labels=['INTRA_OP_COMPLICATIONS','PATIENTNUMBER'], axis = 'columns').copy()
y_test = test['INTRA_OP_COMPLICATIONS'].copy() 

In [72]:
print('% pos labels train {:.2f}'.format(y_train.sum()/len(y_train)))
print('% pos labels test {:.2f}'.format(y_test.sum()/len(y_test)))

% pos labels train 0.06
% pos labels test 0.06


In [73]:
x_test

Unnamed: 0,CENTERCODE,GENDER,AGEATSURGERY,MARITALSTATUS,RACE,EDUCATION,BMI,CLINICALSIZEmm,CHARLSONSCORE,SYMPTOMS,...,R.E.N.A.L.NEPHRORISKSTRATIFICATION,RADIUSmaximaldiameterincm,NEARNESSOFTUMOUR,ANTERIORORPOSTERIOR,LOCATIONTOPOLARLINE,ASASCORE,PARTIALNEPHROINDICATION,MULTIFOCALITY,NOOFLESIONS,center_volume
0,5,1,61.0,1,0,4,38.7600,50.0,2.0,1,...,2,0,0,0,2,2,1,1,2.0,159.0
1,7,1,33.0,1,3,0,27.1700,60.0,2.0,1,...,3,0,0,3,2,1,1,1,1.0,27.0
2,10,1,41.0,1,3,2,24.2000,21.0,0.0,0,...,2,1,1,0,2,1,1,1,1.0,21.0
3,17,1,35.0,3,1,3,27.2700,34.0,0.0,0,...,1,1,2,0,0,0,1,1,1.0,48.0
4,16,1,56.0,1,1,3,27.9200,25.0,0.0,0,...,1,1,1,3,0,0,1,2,1.0,22.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502,7,1,50.0,1,3,0,22.9400,25.0,2.0,0,...,1,1,1,3,0,0,1,1,1.0,29.0
503,5,1,34.0,3,4,4,21.2000,63.0,1.0,0,...,4,0,3,1,1,0,1,1,1.0,339.0
504,7,0,51.0,1,3,4,21.9000,29.0,2.0,0,...,2,1,0,3,0,0,1,1,1.0,110.0
505,5,1,56.0,3,4,2,22.8695,38.0,1.0,1,...,4,1,0,0,2,2,1,1,1.0,140.0


### One Hot Encoding Cataegorical Data

In [74]:
# ONE HOT CODE data for training

# Create dummy variables
with open (input_path+'\cat_col', 'rb') as fp:
    cat_col = pickle.load(fp)

from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')

one_hot_encoded_array = encoder.fit_transform(x_train[cat_col]).toarray()
column_name = encoder.get_feature_names(cat_col)
x_train_OHE =  pd.DataFrame(one_hot_encoded_array, columns= column_name)
x_train = x_train.merge(x_train_OHE, how = 'left', left_index = True, right_index =True) # create dummy variables
x_train = x_train.drop(labels = cat_col, axis = 'columns') # drop original variables

In [75]:
# Create dummy variables
one_hot_encoded_array = encoder.transform(x_test[cat_col]).toarray()
column_name = encoder.get_feature_names(cat_col)
x_test_OHE =  pd.DataFrame(one_hot_encoded_array, columns= column_name)
x_test = x_test.merge(x_test_OHE, how = 'left', left_index = True, right_index =True) # create dummy variables
x_test = x_test.drop(labels = cat_col, axis = 'columns') # drop original variables

In [76]:
print("Number of records in trainset {}".format(len(x_train)))
print("Number records in testset {}".format(len(x_test)))
print('% pos labels train {:.2f}'.format(y_train.sum()/len(y_train)))
print('% pos labels test {:.2f}'.format(y_test.sum()/len(y_test)))

Number of records in trainset 1183
Number records in testset 507
% pos labels train 0.06
% pos labels test 0.06


### Defining Pipeline

In [45]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [46]:
# Pipeline for logist Classifier
# Pipeline for logist Classifier
numeric_features = x_train.select_dtypes('float').columns.tolist()
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),
                                      ('scaler', StandardScaler())
                                     ])
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features)], remainder='passthrough')

### Neural Network Classifier

In [47]:
# compute weight to account for class imbalance
from sklearn.utils.class_weight import compute_class_weight
weights = compute_class_weight(class_weight='balanced', classes=y_train.unique(), y = y_train)
weights
class_weight = {0: weights[0] , 1: weights[1]}
print(class_weight)

{0: 0.5300179211469535, 1: 8.828358208955224}


In [57]:
def nn_model(dropout_rate, neurons, learning_rate):
    from numpy.random import seed
    seed(123)
    tf.random.set_seed(123)
    tf.keras.backend.clear_session()
   
    # input layer
    input_layer = keras.layers.Input(shape=(x_train.shape[1],), name = "input_layer")
    x = keras.layers.Dense(neurons, name = 'Dense_1',activation='relu')(input_layer)
    x = keras.layers.Dropout(dropout_rate, name=  'Dropout_1', seed = 42)(x)
    x = keras.layers.Dense(neurons, name = 'Dense_2',activation='relu')(x)
    x = keras.layers.Dropout(dropout_rate, name=  'Dropout_2', seed = 42)(x)
    main_output = keras.layers.Dense(1, activation='sigmoid',name='main_output')(x)

    model = keras.Model(inputs= input_layer, outputs=main_output)

    # compiling the model
    model.compile(optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=[tf.keras.metrics.AUC(curve = 'ROC',name = 'AUC_ROC'),
                           tf.keras.metrics.AUC(curve = 'PR', name = 'AUC_PR')],
                  )

    # Keras callback. The patience parameter is the amount of epochs to check for improvement
    return model

In [58]:
np.random.seed(1)
tf.random.set_seed(1)
from keras.wrappers.scikit_learn import KerasClassifier
model = KerasClassifier(build_fn=nn_model, verbose=0)
# grid search epochs, batch size and optimizer
parameter_dist = {'classifier__dropout_rate':[0.1,0.2,0.3,0.4,0.5],
                  'classifier__epochs':[10,20,30],
                  'classifier__neurons':[128, 256],
                  'classifier__learning_rate': [0.01, 0.001, 0.0001],
                 }

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', model)])
grid = GridSearchCV(clf,parameter_dist,n_jobs= 1,scoring= 'average_precision', cv = 10, verbose=1)

In [59]:
%time
#early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, mode='min',restore_best_weights=True)
grid.fit(x_train,y_train, classifier__class_weight = class_weight)

Wall time: 0 ns
Fitting 10 folds for each of 90 candidates, totalling 900 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The las

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.

ValueError

KeyboardInterrupt: 

In [14]:
results_nn = pd.DataFrame(grid.predict(x_test), columns=['pred_label'])
results_nn['pred_prob'] =  pd.DataFrame(grid.predict_proba(x_test))[1]
results_nn['true_label'] = np.array(y_test)

In [15]:
results_nn

Unnamed: 0,pred_label,pred_prob,true_label
0,0,0.305865,0
1,0,0.182435,0
2,1,0.511340,0
3,0,0.045018,0
4,0,0.269878,0
...,...,...,...
502,0,0.097726,0
503,0,0.127194,0
504,0,0.158327,0
505,0,0.128861,0


In [16]:
# NeuralNetwork Score Raw Data
print(grid.best_params_)
print("\n Model Best CV score: \n" + str(grid.best_score_))
print("\n Confusion Matrix : \n"+str(metrics.confusion_matrix(results_nn['true_label'], results_nn['pred_label'])))
print("\n Classification Report: \n"+ str(metrics.classification_report(results_nn['true_label'], results_nn['pred_label'])))
print("\n AUC-ROC: \n"+ str(metrics.roc_auc_score(results_nn['true_label'], results_nn['pred_prob'])))


def calc_aucpr_data(result):
    y_ACTUAL = result['true_label']
    scores_prob = result['pred_prob']
    yhat = result['pred_label']
    precision, recall, thresholds = metrics.precision_recall_curve(y_ACTUAL, scores_prob, pos_label=1)
    prc_auc = metrics.auc(recall,precision)
    return prc_auc

print("\n PR-ROC: \n"+ str(calc_aucpr_data(results_nn)))

{'classifier__dropout_rate': 0.2, 'classifier__epochs': 10, 'classifier__learning_rate': 0.0001, 'classifier__neurons': 256}

 Model Best CV score: 
0.43557357159230425

 Confusion Matrix : 
[[398  81]
 [ 10  18]]

 Classification Report: 
              precision    recall  f1-score   support

           0       0.98      0.83      0.90       479
           1       0.18      0.64      0.28        28

    accuracy                           0.82       507
   macro avg       0.58      0.74      0.59       507
weighted avg       0.93      0.82      0.86       507


 AUC-ROC: 
0.8555025350432448

 PR-ROC: 
0.38264200165036266


In [17]:
def bootstrapped_AUC(result):
    from sklearn.utils import resample
    #from tqdm import tqdm

    n_iter = 10000
    roc_auc = list()
    prc_auc = list()


    for i in range(n_iter):
        result_sample = resample(result, n_samples = len(result),random_state=i)
        
        #Calculating AUROC for each sample
        y_ACTUAL= result_sample['true_label']
        scores_prob = result_sample['pred_prob']
        fpr, tpr, thresholds = metrics.roc_curve(y_ACTUAL, scores_prob, pos_label=1)
        roc_auc.append(metrics.auc(fpr, tpr))

        #calculate AUPRC for each sample
        y_ACTUAL = result_sample['true_label']
        scores_prob = result_sample['pred_prob']
        yhat = result_sample['pred_label']
        precision, recall, thresholds = metrics.precision_recall_curve(y_ACTUAL, scores_prob, pos_label=1)
        prc_auc.append(metrics.auc(recall,precision))
    
    return roc_auc, prc_auc

In [18]:
roc_auc_nn, pr_auc_nn = bootstrapped_AUC(results_nn)

In [19]:
dict = {'roc_auc_nn': roc_auc_nn,
        'pr_auc_nn': pr_auc_nn,
       }
pd.DataFrame(dict).describe(percentiles=[0.025,0.975])

Unnamed: 0,roc_auc_nn,pr_auc_nn
count,10000.0,10000.0
mean,0.855882,0.398036
std,0.036822,0.100997
min,0.699202,0.087764
2.5%,0.778824,0.211746
50%,0.85763,0.392938
97.5%,0.922667,0.605475
max,0.976215,0.77161


### Exporting the Model and Pipeline

In [20]:
output_path = 'output/models/'

In [21]:
import os
if not os.path.exists(output_path):
    os.makedirs(output_path)

In [22]:
from joblib import dump
# Save the Keras model first:
grid.best_estimator_.named_steps['classifier'].model.save(output_path+'nn_model.h5')


# Finally, save the pipeline:
dump(grid.best_estimator_.named_steps['preprocessor'], output_path+'nn_pipeline.pkl')

['output/models/nn_pipeline.pkl']

In [68]:
from joblib import dump
dump(encoder, output_path+'/OHE.joblib')

['output/models//OHE.joblib']