In [1]:
# Importing core libraries
import numpy as np
import pandas as pd
from time import time
import pprint
import joblib

# Model selection
from sklearn.model_selection import StratifiedKFold

# Metrics
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import make_scorer

# Data transformation pipelines
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler, StandardScaler

# Graphics
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# TensorFlow 
import tensorflow as tf
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam, Nadam
from keras.layers import Input, Embedding, Reshape, GlobalAveragePooling1D
from keras.layers import Flatten, concatenate, Concatenate, Lambda, Dropout, SpatialDropout1D
from keras.layers import Reshape, MaxPooling1D,BatchNormalization, AveragePooling1D, Conv1D
from keras.layers import Activation, LeakyReLU
from keras.optimizers import SGD, Adam, Nadam
from keras.models import Model, load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.regularizers import l2, l1_l2
from keras.losses import binary_crossentropy
from keras.utils import get_custom_objects
from keras.layers import Activation, LeakyReLU
from keras.models import load_model

In [3]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [5]:
df = pd.read_csv('cleaned-outcomes-no-hypertension.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)

In [6]:
# df.rename(columns = {'outcome_hypertension':'hypertension'}, inplace=True)
df.head()

Unnamed: 0,30850-0.0,30780-0.0,30690-0.0,1488-0.0,30790-0.0,1418-0.0,1329-0.0,4079-0.0,1220-0.0,23101-0.0,...,outcome_heart_failure,hypertension,outcome_peripheral_vascular_disease,outcome_cardiac_arrest,outcome_cerebral_infarction,outcome_arrhythmia,age,multi-labels,gender,race
0,-1.279446,0.552978,0.856104,0.772987,0.160138,3,2,-0.492958,0,-0.947164,...,0,0,0,0,0,1,54,"[0, 0, 0, 0, 0, 0, 0, 1]",Female,British
1,0.868954,0.158768,0.079159,-0.539868,-0.613795,2,2,0.775257,0,1.556107,...,0,1,0,0,0,0,65,"[1, 0, 1, 0, 0, 0, 0, 0]",Male,British
2,0.541023,0.331835,0.296342,-1.196296,-0.297654,2,1,1.499951,0,1.309186,...,0,1,0,1,1,1,55,"[0, 0, 1, 0, 0, 1, 1, 1]",Male,British
3,-1.060849,-0.519316,0.121831,-1.196296,-0.146259,2,2,-1.036478,0,-1.37289,...,0,0,0,0,0,1,49,"[0, 0, 0, 0, 0, 0, 0, 1]",Female,Irish
4,-1.237093,-0.751772,-0.590704,1.101201,-0.822833,3,2,-0.855305,1,-0.836475,...,0,1,0,0,0,0,61,"[1, 0, 0, 0, 0, 0, 0, 0]",Female,British


In [27]:
outcomes = ['outcome_myocardial_infarction','outcome_cardiomyopathies','outcome_ischemic_heart_disease','outcome_heart_failure','outcome_peripheral_vascular_disease','outcome_cardiac_arrest','outcome_cerebral_infarction','outcome_arrhythmia']


categorical_cols = ['1418','1329','1220','1428','1249','1349','1369','20117','2100','2654','1339','21000','2050','1408','1200','1538','31','6138','1359','20491','1389','1478','2090','20414','1508','1379','6142','1468','1548','20403','1239','1448', 'hypertension']
numerical_cols = ['1488','4079','1299','21003','1160','1438','4080','1458','1528','1319','845','1289','1309']
continuous_cols = ['30850','30780','30690','30790','23101','23099','48','23100','30710','30760','30640','30750','49','30770','30740','30630','30870','21001']

categorical_cols = [s + '-0.0' for s in categorical_cols]
numerical_cols = [s + '-0.0' for s in numerical_cols]
continuous_cols = [s + '-0.0' for s in continuous_cols]

In [24]:
print( 'total records: %d \nhypertension records: %d' % (df.size, df[df['hypertension'] == 1.0].size))

total records: 2134239 
hypertension records: 1330182


In [28]:
# remove rows where there is only hypertension

df2 = df[df[outcomes] != 0.0]

In [30]:
df.head(100)

Unnamed: 0,30850-0.0,30780-0.0,30690-0.0,1488-0.0,30790-0.0,1418-0.0,1329-0.0,4079-0.0,1220-0.0,23101-0.0,...,1448-0.0,outcome_myocardial_infarction,outcome_cardiomyopathies,outcome_ischemic_heart_disease,outcome_heart_failure,hypertension,outcome_peripheral_vascular_disease,outcome_cardiac_arrest,outcome_cerebral_infarction,outcome_arrhythmia
0,-1.279446,0.552978,0.856104,0.772987,0.160138,3.0,2.0,-0.492958,0.0,-0.947164,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.868954,0.158768,0.079159,-0.539868,-0.613795,2.0,2.0,0.775257,0.0,1.556107,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-0.567810,-0.354348,-0.310521,0.444774,-0.070011,2.0,3.0,0.141149,0.0,1.981834,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.541023,0.331835,0.296342,-1.196296,-0.297654,2.0,1.0,1.499951,0.0,1.309186,...,3.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0
4,-1.060849,-0.519316,0.121831,-1.196296,-0.146259,2.0,2.0,-1.036478,0.0,-1.372890,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.106618,1.451734,1.240148,0.444774,-0.430911,3.0,2.0,0.141149,1.0,-0.546981,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,1.703039,0.051645,-0.144665,-1.196296,-0.657058,2.0,0.0,1.318777,1.0,0.985634,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
97,0.508269,0.053788,-0.131783,0.772987,-0.689701,5.0,1.0,-0.402371,1.0,0.926032,...,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
98,0.690149,0.468351,0.622618,-1.196296,2.467697,6.0,1.0,2.043471,0.0,1.751941,...,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


### myocardial infarction model

### Set of binary classifiers

In [None]:
x = df.loc[:,:-9]
y_vect = df.loc[:,-9:]

### Build Model

### Optimize Parameters

### Evaluate Performance

### Save Model