In [37]:
import pandas as pd
import numpy as np
import pickle
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import GridSearchCV

In [38]:
input_path = 'H:\RediMinds\VCQI'
train = pd.read_csv(input_path+"\VCQI_clean_train.csv")
test = pd.read_csv(input_path+"\VCQI_clean_test.csv")

In [39]:
x_train = train.drop(labels='INTRA_OP_COMPLICATIONS', axis = 'columns').copy()
y_train = train['INTRA_OP_COMPLICATIONS'].copy()
x_test = test.drop(labels='INTRA_OP_COMPLICATIONS', axis = 'columns').copy()
y_test = test['INTRA_OP_COMPLICATIONS'].copy() 

In [40]:
print('% pos labels train {:.2f}'.format(y_train.sum()/len(y_train)))
print('% pos labels test {:.2f}'.format(y_test.sum()/len(y_test)))

% pos labels train 0.05
% pos labels test 0.05


### One Hot Encoding Cataegorical Data

In [41]:
# ONE HOT CODE data for training

# Create dummy variables
with open (input_path+'\cat_col', 'rb') as fp:
    cat_col = pickle.load(fp)


from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')

one_hot_encoded_array = encoder.fit_transform(x_train[cat_col]).toarray()
column_name = encoder.get_feature_names(cat_col)
x_train_OHE =  pd.DataFrame(one_hot_encoded_array, columns= column_name)
x_train = x_train.merge(x_train_OHE, how = 'left', left_index = True, right_index =True) # create dummy variables
x_train = x_train.drop(labels = cat_col, axis = 'columns') # drop original variables

In [42]:
# Create dummy variables
one_hot_encoded_array = encoder.transform(x_test[cat_col]).toarray()
column_name = encoder.get_feature_names(cat_col)
x_test_OHE =  pd.DataFrame(one_hot_encoded_array, columns= column_name)
x_test = x_test.merge(x_test_OHE, how = 'left', left_index = True, right_index =True) # create dummy variables
x_test = x_test.drop(labels = cat_col, axis = 'columns') # drop original variables

In [43]:
print("Number of records in trainset {}".format(len(x_train)))
print("Number records in testset {}".format(len(x_test)))
print('% pos labels train {:.2f}'.format(y_train.sum()/len(y_train)))
print('% pos labels test {:.2f}'.format(y_test.sum()/len(y_test)))

Number of records in trainset 1985
Number records in testset 852
% pos labels train 0.05
% pos labels test 0.05


### Defining Pipeline

In [44]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [45]:
# Pipeline for logist Classifier
numeric_features = x_train.select_dtypes('float').columns.tolist()
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features)], remainder='passthrough')

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None) 
import numpy as np
import pickle

import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [2]:
input_path = 'H:\RediMinds\VCQI'
train = pd.read_csv(input_path+"\VCQI_clean_train.csv")
test = pd.read_csv(input_path+"\VCQI_clean_test.csv")

In [3]:
# Create dummy variables
with open (input_path+'\cat_col', 'rb') as fp:
    cat_col = pickle.load(fp)


from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')#drop = 'first')

nn_train = train.copy()
one_hot_encoded_array = encoder.fit_transform(train[cat_col]).toarray()
column_name = encoder.get_feature_names(cat_col)
nn_train_one =  pd.DataFrame(one_hot_encoded_array, columns= column_name)
nn_train = nn_train.merge(nn_train_one, how = 'left', left_index = True, right_index =True) # create dummy variables
nn_train = nn_train.drop(labels = cat_col, axis = 'columns') # drop original variables

In [4]:
nn_train.head()

Unnamed: 0,AGEATSURGERY,BMI,CLINICALSIZEmm,PREOPHB,PREOPHT,PREOPWBC,PREOPCREAT,PREOPEGFR,NOOFLESIONS,INTRA_OP_COMPLICATIONS,GENDER_0,GENDER_1,GENDER_2,MARITALSTATUS_0,MARITALSTATUS_1,MARITALSTATUS_2,MARITALSTATUS_3,MARITALSTATUS_4,RACE_0,RACE_1,RACE_2,RACE_3,RACE_4,EDUCATION_0,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,ECOG_0,ECOG_1,ECOG_2,ECOG_3,ECOG_4,SYMPTOMS_0,SYMPTOMS_1,SYMPTOMS_2,SYMPTOMS_3,SOLITARYKIDNEY_0,SOLITARYKIDNEY_1,SOLITARYKIDNEY_2,SOLITARYKIDNEY_3,BILATERALITYOFTUMOR_0,BILATERALITYOFTUMOR_1,BILATERALITYOFTUMOR_2,SIDEOFTUMOR_0,SIDEOFTUMOR_1,SIDEOFTUMOR_2,SIDEOFTUMOR_3,SIDEOFSURGERY_0,SIDEOFSURGERY_1,SIDEOFSURGERY_2,SIDEOFSURGERY_3,FACE_0,FACE_1,FACE_2,TUMORlOCATION_0,TUMORlOCATION_1,TUMORlOCATION_2,TUMORlOCATION_3,TUMORlOCATION_4,TUMORlOCATION_5,POLARLOCATION_0,POLARLOCATION_1,POLARLOCATION_2,RIMLOCATION_0,RIMLOCATION_1,RIMLOCATION_2,RENALSINUS_0,RENALSINUS_1,RENALSINUS_2,EXOPHYTICRATE_0,EXOPHYTICRATE_1,EXOPHYTICRATE_2,EXOPHYTICRATE_3,CLINICALSIZEGROUP_0,CLINICALSIZEGROUP_1,CLINICALSIZEGROUP_2,CLINICALSIZEGROUP_3,CT_0,CT_1,CT_2,CT_3,CT_4,CT_5,CT_6,CN_0,CN_1,CN_2,CN_3,CM_0,CM_1,CM_2,CM_3,CM_4,CM_5,RADIUSmaximaldiameterincm_0,RADIUSmaximaldiameterincm_1,RADIUSmaximaldiameterincm_2,RADIUSmaximaldiameterincm_3,EXOPHYTICENDOPHYTICPROPERTIES_0,EXOPHYTICENDOPHYTICPROPERTIES_1,EXOPHYTICENDOPHYTICPROPERTIES_2,EXOPHYTICENDOPHYTICPROPERTIES_3,EXOPHYTICENDOPHYTICPROPERTIES_4,NEARNESSOFTUMOUR_0,NEARNESSOFTUMOUR_1,NEARNESSOFTUMOUR_2,NEARNESSOFTUMOUR_3,ANTERIORORPOSTERIOR_0,ANTERIORORPOSTERIOR_1,ANTERIORORPOSTERIOR_2,ANTERIORORPOSTERIOR_3,LOCATIONTOPOLARLINE_0,LOCATIONTOPOLARLINE_1,LOCATIONTOPOLARLINE_2,LOCATIONTOPOLARLINE_3,PARTIALNEPHROINDICATION_0,PARTIALNEPHROINDICATION_1,PARTIALNEPHROINDICATION_2,PARTIALNEPHROINDICATION_3,MULTIFOCALITY_0,MULTIFOCALITY_1,MULTIFOCALITY_2
0,68.0,28.8044,17.0,11.3,34.9,3000.0,0.96,58.0,1.0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,61.0,25.1,33.0,12.3,39.4,5400.0,1.25,58.72,1.0,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,69.0,24.52,18.0,15.4,43.5,7200.0,0.91,82.61,1.0,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,73.0,19.6,40.0,13.91,41.56,7205.87,1.13,64.0,1.0,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,62.0,23.66,35.0,11.9,36.0,5500.0,0.94,60.338832,1.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [5]:
# Create dummy variables
nn_test = test.copy()
one_hot_encoded_array = encoder.transform(test[cat_col]).toarray()
column_name = encoder.get_feature_names(cat_col)
nn_test_one =  pd.DataFrame(one_hot_encoded_array, columns= column_name)
nn_test = nn_test.merge(nn_test_one, how = 'left', left_index = True, right_index =True) # create dummy variables
nn_test = nn_test.drop(labels = cat_col, axis = 'columns') # drop original variables

In [6]:
nn_test.head()

Unnamed: 0,AGEATSURGERY,BMI,CLINICALSIZEmm,PREOPHB,PREOPHT,PREOPWBC,PREOPCREAT,PREOPEGFR,NOOFLESIONS,INTRA_OP_COMPLICATIONS,GENDER_0,GENDER_1,GENDER_2,MARITALSTATUS_0,MARITALSTATUS_1,MARITALSTATUS_2,MARITALSTATUS_3,MARITALSTATUS_4,RACE_0,RACE_1,RACE_2,RACE_3,RACE_4,EDUCATION_0,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,ECOG_0,ECOG_1,ECOG_2,ECOG_3,ECOG_4,SYMPTOMS_0,SYMPTOMS_1,SYMPTOMS_2,SYMPTOMS_3,SOLITARYKIDNEY_0,SOLITARYKIDNEY_1,SOLITARYKIDNEY_2,SOLITARYKIDNEY_3,BILATERALITYOFTUMOR_0,BILATERALITYOFTUMOR_1,BILATERALITYOFTUMOR_2,SIDEOFTUMOR_0,SIDEOFTUMOR_1,SIDEOFTUMOR_2,SIDEOFTUMOR_3,SIDEOFSURGERY_0,SIDEOFSURGERY_1,SIDEOFSURGERY_2,SIDEOFSURGERY_3,FACE_0,FACE_1,FACE_2,TUMORlOCATION_0,TUMORlOCATION_1,TUMORlOCATION_2,TUMORlOCATION_3,TUMORlOCATION_4,TUMORlOCATION_5,POLARLOCATION_0,POLARLOCATION_1,POLARLOCATION_2,RIMLOCATION_0,RIMLOCATION_1,RIMLOCATION_2,RENALSINUS_0,RENALSINUS_1,RENALSINUS_2,EXOPHYTICRATE_0,EXOPHYTICRATE_1,EXOPHYTICRATE_2,EXOPHYTICRATE_3,CLINICALSIZEGROUP_0,CLINICALSIZEGROUP_1,CLINICALSIZEGROUP_2,CLINICALSIZEGROUP_3,CT_0,CT_1,CT_2,CT_3,CT_4,CT_5,CT_6,CN_0,CN_1,CN_2,CN_3,CM_0,CM_1,CM_2,CM_3,CM_4,CM_5,RADIUSmaximaldiameterincm_0,RADIUSmaximaldiameterincm_1,RADIUSmaximaldiameterincm_2,RADIUSmaximaldiameterincm_3,EXOPHYTICENDOPHYTICPROPERTIES_0,EXOPHYTICENDOPHYTICPROPERTIES_1,EXOPHYTICENDOPHYTICPROPERTIES_2,EXOPHYTICENDOPHYTICPROPERTIES_3,EXOPHYTICENDOPHYTICPROPERTIES_4,NEARNESSOFTUMOUR_0,NEARNESSOFTUMOUR_1,NEARNESSOFTUMOUR_2,NEARNESSOFTUMOUR_3,ANTERIORORPOSTERIOR_0,ANTERIORORPOSTERIOR_1,ANTERIORORPOSTERIOR_2,ANTERIORORPOSTERIOR_3,LOCATIONTOPOLARLINE_0,LOCATIONTOPOLARLINE_1,LOCATIONTOPOLARLINE_2,LOCATIONTOPOLARLINE_3,PARTIALNEPHROINDICATION_0,PARTIALNEPHROINDICATION_1,PARTIALNEPHROINDICATION_2,PARTIALNEPHROINDICATION_3,MULTIFOCALITY_0,MULTIFOCALITY_1,MULTIFOCALITY_2
0,58.0,26.87,22.0,13.9,43.0,7200.0,1.08,70.224904,1.0,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,63.0,32.0,32.0,13.91,41.56,7205.87,0.97,78.1695,1.05,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,73.0,27.78,57.0,14.3,41.56,7205.87,0.99,81.07,1.0,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,41.0,25.15,30.0,15.7,47.3,5600.0,1.1,81.07,1.0,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,56.0,28.1,64.0,13.91,41.56,7205.87,0.73,81.07,1.0,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [7]:
x_train = nn_train.drop(labels='INTRA_OP_COMPLICATIONS', axis = 'columns').copy()
y_train = nn_train['INTRA_OP_COMPLICATIONS'].copy()
x_test = nn_test.drop(labels='INTRA_OP_COMPLICATIONS', axis = 'columns').copy()
y_test = nn_test['INTRA_OP_COMPLICATIONS'].copy()

In [8]:
# Split train and validation set
from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.20, random_state=42, stratify = y_train)

In [9]:
print("Number of records in trainset {}".format(len(x_train)))
print("Number of records in validset {}".format(len(x_valid)))
print("Number records in testset {}".format(len(x_test)))
print('% pos labels train {:.2f}'.format(y_train.sum()/len(y_train)))
print('% pos labels valid {:.2f}'.format(y_valid.sum()/len(y_valid)))
print('% pos labels test {:.2f}'.format(y_test.sum()/len(y_test)))

Number of records in trainset 1588
Number of records in validset 397
Number records in testset 852
% pos labels train 0.05
% pos labels valid 0.05
% pos labels test 0.05


In [8]:
# Standardize the numeric columns
from sklearn.preprocessing import StandardScaler
stdc = StandardScaler()
x_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)

#x_valid.reset_index(drop=True, inplace=True)
#y_valid.reset_index(drop=True, inplace=True)

x_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

float_col = train.select_dtypes('float').columns
x_train[float_col] = pd.DataFrame(stdc.fit_transform(x_train[float_col]),columns=float_col).copy()
#x_valid[float_col] = pd.DataFrame(stdc.transform(x_valid[float_col]),columns=float_col).copy()
x_test[float_col] = pd.DataFrame(stdc.transform(x_test[float_col]),columns=float_col).copy()

In [9]:
x_train.head()

Unnamed: 0,AGEATSURGERY,BMI,CLINICALSIZEmm,PREOPHB,PREOPHT,PREOPWBC,PREOPCREAT,PREOPEGFR,NOOFLESIONS,GENDER_0,GENDER_1,GENDER_2,MARITALSTATUS_0,MARITALSTATUS_1,MARITALSTATUS_2,MARITALSTATUS_3,MARITALSTATUS_4,RACE_0,RACE_1,RACE_2,RACE_3,RACE_4,EDUCATION_0,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,ECOG_0,ECOG_1,ECOG_2,ECOG_3,ECOG_4,SYMPTOMS_0,SYMPTOMS_1,SYMPTOMS_2,SYMPTOMS_3,SOLITARYKIDNEY_0,SOLITARYKIDNEY_1,SOLITARYKIDNEY_2,SOLITARYKIDNEY_3,BILATERALITYOFTUMOR_0,BILATERALITYOFTUMOR_1,BILATERALITYOFTUMOR_2,SIDEOFTUMOR_0,SIDEOFTUMOR_1,SIDEOFTUMOR_2,SIDEOFTUMOR_3,SIDEOFSURGERY_0,SIDEOFSURGERY_1,SIDEOFSURGERY_2,SIDEOFSURGERY_3,FACE_0,FACE_1,FACE_2,TUMORlOCATION_0,TUMORlOCATION_1,TUMORlOCATION_2,TUMORlOCATION_3,TUMORlOCATION_4,TUMORlOCATION_5,POLARLOCATION_0,POLARLOCATION_1,POLARLOCATION_2,RIMLOCATION_0,RIMLOCATION_1,RIMLOCATION_2,RENALSINUS_0,RENALSINUS_1,RENALSINUS_2,EXOPHYTICRATE_0,EXOPHYTICRATE_1,EXOPHYTICRATE_2,EXOPHYTICRATE_3,CLINICALSIZEGROUP_0,CLINICALSIZEGROUP_1,CLINICALSIZEGROUP_2,CLINICALSIZEGROUP_3,CT_0,CT_1,CT_2,CT_3,CT_4,CT_5,CT_6,CN_0,CN_1,CN_2,CN_3,CM_0,CM_1,CM_2,CM_3,CM_4,CM_5,RADIUSmaximaldiameterincm_0,RADIUSmaximaldiameterincm_1,RADIUSmaximaldiameterincm_2,RADIUSmaximaldiameterincm_3,EXOPHYTICENDOPHYTICPROPERTIES_0,EXOPHYTICENDOPHYTICPROPERTIES_1,EXOPHYTICENDOPHYTICPROPERTIES_2,EXOPHYTICENDOPHYTICPROPERTIES_3,EXOPHYTICENDOPHYTICPROPERTIES_4,NEARNESSOFTUMOUR_0,NEARNESSOFTUMOUR_1,NEARNESSOFTUMOUR_2,NEARNESSOFTUMOUR_3,ANTERIORORPOSTERIOR_0,ANTERIORORPOSTERIOR_1,ANTERIORORPOSTERIOR_2,ANTERIORORPOSTERIOR_3,LOCATIONTOPOLARLINE_0,LOCATIONTOPOLARLINE_1,LOCATIONTOPOLARLINE_2,LOCATIONTOPOLARLINE_3,PARTIALNEPHROINDICATION_0,PARTIALNEPHROINDICATION_1,PARTIALNEPHROINDICATION_2,PARTIALNEPHROINDICATION_3,MULTIFOCALITY_0,MULTIFOCALITY_1,MULTIFOCALITY_2
0,0.859442,0.159461,-1.095741,-2.102967,-1.85101,-2.775703,0.152043,-1.384971,-0.212997,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.288479,-0.621134,0.084865,-1.300945,-0.602226,-1.185677,1.525903,-1.341541,-0.212997,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.941009,-0.743352,-1.021953,1.185324,0.535555,0.006843,-0.084829,0.099492,-0.212997,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,1.267274,-1.7801,0.601381,-0.009689,-0.00281,0.010732,0.95741,-1.023054,-0.212997,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.370045,-0.924572,0.232441,-1.621753,-1.545752,-1.119426,0.057294,-1.243894,-0.212997,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [65]:
from sklearn.utils.class_weight import compute_class_weight

In [66]:
weights = compute_class_weight(class_weight='balanced', classes=y_train.unique(), y = y_train)

In [67]:
weights

array([ 0.52596714, 10.12755102])

In [68]:
class_weight = {0: weights[0] , 1: weights[1]}

In [69]:
def nn_model(dropout_rate, neurons, learning_rate):
    from numpy.random import seed
    seed(123)
    from tensorflow.random import set_seed
    set_seed(123)
    tf.keras.backend.clear_session()
   
    # input layer
    input_layer = keras.layers.Input(shape=(x_train.shape[1],), name = "input_layer")
    x = keras.layers.Dense(neurons, name = 'Dense_1',activation='relu')(input_layer)
    x = keras.layers.Dropout(dropout_rate, name=  'Dropout_1', seed = 42)(x)
    x = keras.layers.Dense(neurons, name = 'Dense_2',activation='relu')(x)
    x = keras.layers.Dropout(dropout_rate, name=  'Dropout_2', seed = 42)(x)
    main_output = keras.layers.Dense(1, activation='sigmoid',name='main_output')(x)

    model = keras.Model(inputs= input_layer, outputs=main_output)

    # compiling the model
    model.compile(optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=[tf.keras.metrics.AUC(curve = 'ROC',name = 'AUC_ROC'),
                           tf.keras.metrics.AUC(curve = 'PR', name = 'AUC_PR')],
                  )

    # Keras callback. The patience parameter is the amount of epochs to check for improvement
    
    return model

In [70]:
from keras.wrappers.scikit_learn import KerasClassifier

In [71]:
np.random.seed(1)
tf.random.set_seed(1)
model = KerasClassifier(build_fn=nn_model, verbose=0)
# grid search epochs, batch size and optimizer
parameter_dist = {'classifier__dropout_rate':[0.2,0.3,0.4,0.5],
                  'classifier__epochs':[20],
                  'classifier__neurons':[128],
                  'classifier__learning_rate': [0.01, 0.005, 0.001, 0.0005],
                 }

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      #('pca',PCA()),
                      ('classifier', model)])
nn_model = GridSearchCV(clf,parameter_dist,n_jobs= 1,scoring= 'roc_auc', cv = 10)

In [73]:
#early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, mode='min',restore_best_weights=True)
nn_model.fit(x_train,y_train, classifier__class_weight = class_weight)

KeyboardInterrupt: 

In [74]:
nn_model.cv_results_

AttributeError: 'GridSearchCV' object has no attribute 'cv_results_'

In [None]:
pd.DataFrame(nn_model.best_estimator_.model.history.history)[['loss','val_loss']].plot(figsize=(10,8))

In [None]:
# input layer
np.random.seed(1)
tf.random.set_seed(1)
input_layer = keras.layers.Input(shape=(x_train.shape[1],), name = "input_layer")
x = keras.layers.Dense(128, name = 'Dense_1',activation='relu')(input_layer)
x = keras.layers.Dropout(0.5, name=  'Dropout_1', seed = 42)(x)
x = keras.layers.Dense(128, name = 'Dense_2',activation='relu')(x)
x = keras.layers.Dropout(0.5, name=  'Dropout_2', seed = 42)(x)
main_output = keras.layers.Dense(1, activation='sigmoid',name='main_output')(x)

model = keras.Model(inputs= input_layer, outputs=main_output)

# compiling the model
model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=[tf.keras.metrics.AUC(curve = 'ROC',name = 'AUC_ROC'),
                       tf.keras.metrics.AUC(curve = 'PR', name = 'AUC_PR')],
              )

# Keras callback. The patience parameter is the amount of epochs to check for improvement
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, mode='min',restore_best_weights=True)


model.summary()

In [None]:
model.fit(x = x_train, y = y_train,
          validation_data = (x_valid, y_valid),
          epochs = 100,
          callbacks = [early_stop],
          class_weight=class_weight
     )

In [52]:
model= nn_model

In [53]:
# predict on test set
pd.DataFrame(model.predict(x_test))

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,1
9,0


In [54]:
results_nn = pd.DataFrame(model.predict(x_test), columns=['pred_prob'])
results_nn['pred_label'] =  results_nn['pred_prob'].apply(lambda x: 1 if x>=0.5 else 0)
results_nn['true_label'] = np.array(y_test)

In [55]:
# NeuralNetwork Score Raw Data
print("\n Model Balanced Accuracy: \n" + str(metrics.balanced_accuracy_score(results_nn['true_label'], results_nn['pred_label'])))
print("\n Confusion Matrix : \n"+str(metrics.confusion_matrix(results_nn['true_label'], results_nn['pred_label'])))
print("\n Classification Report: \n"+ str(metrics.classification_report(results_nn['true_label'], results_nn['pred_label'])))
print("\n AUC-ROC: \n"+ str(metrics.roc_auc_score(results_nn['true_label'], results_nn['pred_prob'])))


def calc_aucpr_data(result):
    y_ACTUAL = result['true_label']
    scores_prob = result['pred_prob']
    yhat = result['pred_label']
    precision, recall, thresholds = metrics.precision_recall_curve(y_ACTUAL, scores_prob, pos_label=1)
    prc_auc = metrics.auc(recall,precision)
    return prc_auc

print("\n PR-ROC: \n"+ str(calc_aucpr_data(results_nn)))


 Model Balanced Accuracy: 
0.6625220458553792

 Confusion Matrix : 
[[784  26]
 [ 27  15]]

 Classification Report: 
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       810
           1       0.37      0.36      0.36        42

    accuracy                           0.94       852
   macro avg       0.67      0.66      0.66       852
weighted avg       0.94      0.94      0.94       852


 AUC-ROC: 
0.6625220458553792

 PR-ROC: 
0.37734332826225647


In [None]:
def bootstrapped_AUC(result):
    from sklearn.utils import resample
    from tqdm import tqdm

    n_iter = 10000
    roc_auc = list()
    prc_auc = list()


    for i in range(n_iter):
        result_sample = resample(result, n_samples = len(result),random_state=i)
        
        #Calculating AUROC for each sample
        y_ACTUAL= result_sample['true_label']
        scores_prob = result_sample['pred_prob']
        fpr, tpr, thresholds = metrics.roc_curve(y_ACTUAL, scores_prob, pos_label=1)
        roc_auc.append(metrics.auc(fpr, tpr))

        #calculate AUPRC for each sample
        y_ACTUAL = result_sample['true_label']
        scores_prob = result_sample['pred_prob']
        yhat = result_sample['pred_label']
        precision, recall, thresholds = metrics.precision_recall_curve(y_ACTUAL, scores_prob, pos_label=1)
        prc_auc.append(metrics.auc(recall,precision))
    
    return roc_auc, prc_auc

In [None]:
roc_auc_nn, pr_auc_nn = bootstrapped_AUC(results_nn)

In [None]:
dict = {'roc_auc_nn': roc_auc_nn,
        'pr_auc_nn': pr_auc_nn,
       }
pd.DataFrame(dict).describe(percentiles=[0.025,0.975])

In [None]:
output_path = 'output/models'

In [None]:
import os
if not os.path.exists(output_path):
    os.makedirs(output_path)

In [None]:
model.save(filepath=output_path+'/nn_model.h5')

In [None]:
# Export Standcaler used to process the testset
from joblib import dump
dump(encoder, output_path+'/nn_OHE.joblib')

In [None]:
# Export Standcaler used to process the testset
from joblib import dump
dump(stdc, output_path+'/nn_stdc.joblib')