In [None]:
import os
# Expose only GPUs 0-3 to CUDA. This is set here, before TensorFlow is imported
# further down in this cell.
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3"

import matplotlib 
# Select the non-interactive Agg backend before pyplot is imported below.
matplotlib.use('agg')

# Print the platform and library versions used for this run.
import platform; print(platform.platform())
import sys; print("Python", sys.version)
import numpy; print("NumPy", numpy.__version__)
import scipy; print("SciPy", scipy.__version__)
import sklearn; print("Scikit-Learn", sklearn.__version__)
import seaborn as sns; print("Seaborn", sns.__version__)

import matplotlib.pyplot as plt;

import sys
import findspark
import pandas as pd

from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
import numpy as np

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as f


from tensorflow.keras import backend as K
import tensorflow as tf; print(tf.__version__)
import pyarrow as pa

# TensorFlow session with GPU memory growth enabled, registered as the session
# used by the Keras backend.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess = tf.Session(config=config)
K.set_session(sess)

# HDFS connection obtained through pyarrow with default settings.
fs = pa.hdfs.connect()


In [None]:
# Common prefix of the per-fold parquet files; the fold index and ".parquet"
# are appended to it.
base_file = "./dashboard/Features_random_fold"

# Folds 0-7 are used for training.
train_files = [base_file + str(k) + ".parquet" for k in range(8)]
print(train_files)

# Fold 8 is the validation fold.
val_files = [base_file + "8.parquet"]
print(val_files)

# Fold 9 is the test fold.
test_files = [base_file + "9.parquet"]
print(test_files)


from petastorm import make_reader
from petastorm.tf_utils import tf_tensors, make_petastorm_dataset

# Per-split containers: the petastorm readers that get opened and the datasets
# built on top of them. They are filled in by the following cells.
train_readers, train_datasets = [], []
val_readers, val_datasets = [], []
test_readers, test_datasets = [], []


In [None]:
def _to_model_example(row):
    """Turn one petastorm row into ``((embedding, features), one_hot_label)``.

    The last 1000 values of ``row.EMBEDDING`` are reshaped to (100, 10),
    ``row.FEATURES`` is passed through unchanged, and ``row.readmitLabel`` is
    cast to uint8 and one-hot encoded over two classes.
    """
    embedding = tf.reshape(row.EMBEDDING[-1000:], (100, 10))
    label = tf.one_hot(tf.cast(row.readmitLabel, tf.uint8), 2)
    return (embedding, row.FEATURES), label


def _open_folds(paths, tag, readers, datasets):
    """Open one reader per path and append reader and mapped dataset to the lists.

    Args:
        paths: parquet locations to open.
        tag: split name printed in front of each path.
        readers: list that receives every opened petastorm reader.
        datasets: list that receives the matching mapped dataset.
    """
    for path in paths:
        print(tag + ': ', path)
        # num_epochs=None makes the reader cycle through its file indefinitely.
        reader = make_reader(path, hdfs_driver='libhdfs', num_epochs=None,
                             results_queue_size=1024, workers_count=8)
        readers.append(reader)
        datasets.append(make_petastorm_dataset(reader).map(_to_model_example))


# The three splits go through exactly the same pipeline; only the file list
# and the target lists differ.
_open_folds(train_files, 'train', train_readers, train_datasets)
_open_folds(val_files, 'val', val_readers, val_datasets)
_open_folds(test_files, 'test', test_readers, test_datasets)

print(len(train_datasets), len(val_datasets), len(test_datasets))

# Draw examples at random from the per-fold datasets of each split.
train_ds = tf.data.experimental.sample_from_datasets(train_datasets)
val_ds = tf.data.experimental.sample_from_datasets(val_datasets)
test_ds = tf.data.experimental.sample_from_datasets(test_datasets)


In [None]:
# Rebuild the test dataset for a single evaluation pass.
#
# Readers already collected in `test_readers` were opened with num_epochs=None,
# i.e. they repeat their file forever. Appending one more reader for the same
# file would leave `test_ds` as an endless mix of duplicate streams, so a large
# `.batch(...)` could never return "the test set exactly once". Stop the old
# readers, empty both lists in place, and read every test file once.
for stale_reader in test_readers:
    stale_reader.stop()
    stale_reader.join()
del test_readers[:]
del test_datasets[:]

for test in test_files:
    print('test: ', test)
    # num_epochs=1: the reader is exhausted after one pass over the file.
    r = make_reader(test, hdfs_driver='libhdfs', num_epochs=1, results_queue_size=1024, workers_count=8)
    test_readers.append(r)
    # Same row mapping as for the other splits:
    # ((embedding reshaped to (100, 10), features), one-hot label).
    test_datasets.append(
        make_petastorm_dataset(r).map(
            lambda x: ((tf.reshape(x.EMBEDDING[-1000:], (100, 10)),
                        x.FEATURES),
                       tf.one_hot(tf.cast(x.readmitLabel, tf.uint8), 2))
        )
    )
test_ds = tf.data.experimental.sample_from_datasets(test_datasets)


In [None]:

import tensorflow.keras as keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, CuDNNGRU, CuDNNLSTM, Dropout, Flatten
from tensorflow.python.keras.layers.embeddings import Embedding
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import multi_gpu_model

feat_shape = 1320          # width of the second model input (fed with FEATURES)
embed_shape = (100, 10)    # shape of the first model input (reshaped EMBEDDING)

# Sequence branch: three stacked CuDNN GRU layers with light dropout between
# them; only the last layer collapses the sequence to a single vector.
input_emb = Input(shape=embed_shape)
seq_branch = CuDNNGRU(256, return_sequences=True)(input_emb)
seq_branch = Dropout(0.01)(seq_branch)
seq_branch = CuDNNGRU(128, return_sequences=True)(seq_branch)
seq_branch = Dropout(0.01)(seq_branch)
seq_branch = CuDNNGRU(64)(seq_branch)

# Feature branch: two ReLU dense layers followed by a Flatten.
input_feat = Input(shape=(feat_shape,))
feat_branch = Dense(256, activation='relu')(input_feat)
feat_branch = Dropout(0.01)(feat_branch)
feat_branch = Dense(128, activation='relu')(feat_branch)
feat_branch = Flatten()(feat_branch)

# Join both branches and classify into two softmax outputs.
merged_vector = keras.layers.Concatenate(axis=-1)([feat_branch, seq_branch])
head = Dense(128, activation='relu')(merged_vector)
output = Dense(2, activation='softmax')(head)

model = Model(inputs=[input_emb, input_feat], outputs=[output])

# Replicate the model across 4 GPUs; `model` now refers to the multi-GPU wrapper.
model = multi_gpu_model(model, gpus=4, cpu_merge=True, cpu_relocation=False)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback
from sklearn.utils import class_weight
from sklearn.metrics import auc
from sklearn.metrics import roc_curve

class IntervalEvaluation(Callback):
    """Callback that scores a fixed validation set every ``interval`` epochs.

    On each evaluated epoch it predicts on the stored validation inputs,
    computes the ROC AUC, adds the ROC curve to a figure shared by blocks of
    ten epochs, and rewrites a plot of AUC versus epoch. Both plots are saved
    under ``./outputs``.

    Args:
        validation_data: ``(X_val, y_val)`` pair. ``y_val`` is indexed as a 2-D
            one-hot array (``argmax`` over axis 1).
        interval: evaluate when ``epoch % interval == 0``.

    Raises:
        ValueError: if ``validation_data`` cannot be unpacked into two items
            (this includes the empty default).
    """

    def __init__(self, validation_data=(), interval=1):
        # Call the real parent initialiser. ``super(Callback, self)`` started
        # the lookup *after* Callback and therefore skipped Callback.__init__.
        super(IntervalEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

        # History of evaluated epochs and their AUC, used for the tracking plot.
        self.auc_list = []
        self.epoch_list = []

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate on the validation set and refresh the saved plots.

        Args:
            epoch: index of the epoch that just finished.
            logs: metrics passed in by Keras; not used here.
        """
        if epoch % self.interval != 0:
            return

        y_pred = self.model.predict(self.X_val, verbose=0)
        score = roc_auc_score(self.y_val, y_pred)
        # NOTE(review): no summary writer is set up in this block; confirm that
        # this call actually records anything.
        tf.contrib.summary.scalar('auc', tensor=score, step=epoch)

        fpr_keras, tpr_keras, thresholds_keras = roc_curve(np.argmax(self.y_val, axis=1),
                                                           y_pred[:, 1])
        auc_keras = auc(fpr_keras, tpr_keras)
        print(auc_keras)

        # ROC curves of ten consecutive epochs share one figure. The figure is
        # addressed by a string label rather than by the integer epoch // 10:
        # integer ids clash with the automatically numbered figures created for
        # the tracking plot, which made later ROC curves land on a stale
        # tracking figure.
        roc_fig = plt.figure('val_roc_{:d}'.format(epoch // 10))
        roc_ax = roc_fig.gca()
        roc_ax.plot(fpr_keras, tpr_keras, label=str(epoch) + ' (area = {:.3f})'.format(auc_keras))
        roc_ax.plot([0, 1], [0, 1], 'k--')
        roc_ax.set_xlabel('False positive rate')
        roc_ax.set_ylabel('True positive rate')
        roc_ax.set_title('ROC curve')
        roc_ax.legend(loc='best')
        roc_fig.savefig('./outputs/val_nb_readmit_auc_randomval' + str(epoch) + '.png')

        self.auc_list.append(score)
        self.epoch_list.append(epoch)

        # The tracking plot is redrawn from scratch each time, so its figure is
        # closed right after saving instead of piling up one figure per epoch.
        track_fig = plt.figure()
        track_fig.gca().plot(self.epoch_list, self.auc_list)
        track_fig.savefig('./outputs/val_nb_readmit_auc_track_randomval.png')
        plt.close(track_fig)

#ival = IntervalEvaluation(validation_data=(sess.run(val_ds.batch(10000).make_one_shot_iterator().get_next())), 
#                          interval=1)

In [None]:
# Where the best model (lowest val_loss) is written during training.
modelpath = './models/READMIT_val_best_randomval.hdf5'

# Give up once val_loss has gone 100 epochs without improving.
early_stopping = EarlyStopping(monitor='val_loss', patience=100)

# Save the full model (not only the weights) whenever val_loss improves.
best_checkpoint = ModelCheckpoint(modelpath, monitor='val_loss', verbose=0,
                                  save_best_only=True, save_weights_only=False,
                                  mode='auto', period=1)

# The `ival` and `tb` callbacks remain disabled.
callbacks = [early_stopping, best_checkpoint]


In [None]:
# Training batches of 128 with one batch prefetched; validation batches of 1000.
train_batches = train_ds.batch(128).prefetch(buffer_size=1)
val_batches = val_ds.batch(1000)

# 1000 steps per epoch for up to 1000 epochs, validating on 10 batches per
# epoch; stopping and checkpointing are handled by `callbacks`.
model.fit(train_batches,
          steps_per_epoch=1000,
          epochs=1000,
          validation_data=val_batches,
          validation_steps=10,
          callbacks=callbacks)


In [None]:
# Restore the weights stored at `modelpath`.
model.load_weights(modelpath)

# Pull one (very large) batch from the test dataset into memory:
# sample[0] holds the model inputs, sample[1] the one-hot labels.
test_batch = test_ds.batch(10000000).make_one_shot_iterator().get_next()
sample = sess.run(test_batch)
test_inputs, test_labels = sample[0], sample[1]

y_pred_keras = model.predict(test_inputs)

# ROC AUC straight from the one-hot labels and the two-column predictions.
score = roc_auc_score(test_labels, y_pred_keras)

# ROC curve and its area from column 1 of the labels and predictions.
fpr_keras, tpr_keras, thresholds_keras = roc_curve(test_labels[:, 1], y_pred_keras[:, 1])
auc_keras = auc(fpr_keras, tpr_keras)
print(auc_keras)


In [None]:
%matplotlib inline
# ROC curve of the test predictions. The curve's label is only visible once a
# legend is drawn, and the axes are labelled so the figure can stand alone.
fig, ax = plt.subplots()
ax.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras))
ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal for reference
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('ROC curve')
ax.legend(loc='best');


In [None]:
# Display the ROC AUC that roc_auc_score returned for the test sample.
score