# LSTM Model using Therapy sequence

In [None]:
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional, Input, concatenate, Reshape, Activation, Flatten, Add, BatchNormalization, Multiply, LeakyReLU
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from tensorflow.keras.metrics import AUC, SensitivityAtSpecificity
from tensorflow.keras.optimizers import Adam, Adagrad, RMSprop, Adamax, SGD, Adadelta
from tensorflow.keras.initializers import Constant
from tensorflow.keras.regularizers import L1L2, L1, L2
from livelossplot import PlotLossesKeras
#internal validation
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, f1_score, balanced_accuracy_score, matthews_corrcoef, auc, average_precision_score, roc_auc_score, balanced_accuracy_score, roc_curve, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import pickle

# Fix TensorFlow's global random seed for reproducibility.
# NOTE(review): only TensorFlow is seeded here; NumPy and Python's `random`
# are not seeded in the visible code -- confirm whether that matters.
tf.random.set_seed(1234)



# RUN

In [None]:
# Load the pre-built data splits and the code/month vocabularies from disk.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
def _load_pickle(path):
    """Return the object unpickled from *path*, closing the file afterwards."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


sets = _load_pickle('../SeqModel/sets_search_long.sav')
sets_eval = _load_pickle('../SeqModel/sets_search_long_eval.sav')
code2idx = _load_pickle('../SeqModel/all_vocab.sav')
month2idx = _load_pickle('../SeqModel/all_vocab_month.sav')

# Embedding table sizes: one more than the number of vocabulary entries
# (presumably index 0 is reserved for padding -- confirm against the
# preprocessing code).
vocab_size = len(code2idx) + 1
month_size = len(month2idx) + 1
print(vocab_size)
print(month_size)

In [None]:
# Unpack the development splits (train / val / eval) from `sets`.
# NOTE(review): naming suggests Xt = tabular features, Xs = code sequences,
# Xm = month sequences, y = binary labels -- confirm against the script that
# produced the pickles.
Xt_train, Xt_val, Xt_eval, Xs_train, Xs_val, Xs_eval, Xm_train, Xm_val, Xm_eval, y_train, y_val, y_eval = sets
# Unpack the held-out test splits (overall, Wales, Scotland) from `sets_eval`.
Xt_test, Xt_testWales, Xt_testScotland, Xs_test, Xs_testWales, Xs_testScotland, Xm_test, Xm_testWales, Xm_testScotland, y_test, y_testWales, y_testScotland = sets_eval

In [None]:
# Report the number of samples (first axis of the sequence array) per split.
split_sizes = (
    ('Train: ', Xs_train),
    ('Val: ', Xs_val),
    ('Eval (internal validation): ', Xs_eval),
    ('Test: ', Xs_test),
    ('Test - Wales: ', Xs_testWales),
    ('Test - Scotland: ', Xs_testScotland),
)
for split_label, split_array in split_sizes:
    print(split_label, split_array.shape[0])

In [None]:
# Weight the positive class by the negative/positive ratio of the training
# labels so both classes contribute equally to the loss.
# Counting is vectorised instead of iterating the label array in Python.
_train_labels = np.asarray(y_train).ravel()
n_negative = int(np.count_nonzero(_train_labels == 0))
n_positive = int(np.count_nonzero(_train_labels == 1))
if n_positive == 0:
    # An infinite weight would silently poison the loss; fail loudly instead.
    raise ValueError('y_train contains no positive (1) samples; cannot compute pos_weight')
pos_weight = n_negative / n_positive
class_weight = {0: 1, 1: pos_weight}
print(class_weight)

In [None]:
# Cube root of the vocabulary size, shown as the cell output.
# NOTE(review): presumably a rule-of-thumb candidate for the embedding width
# (the hyperparameter cell has it commented out as an option) -- confirm.
np.cbrt(vocab_size)

In [None]:
# Label of the outcome being modelled.
# NOTE(review): not read by any other visible cell -- confirm it is still needed.
target_outcome = '12months'
# Sequence length fed to the model: inputs are sliced to the first
# `max_codes` positions and the Input layers are sized with it.
max_codes = 750
# Number of columns in the tabular feature matrix.
tab_feature_size = Xt_train.shape[1]

In [None]:
def earlyFussion(embedding_dim=75, month_embedding_dim=7, learning_rate=8e-5, clip_value=0.5):
    """Build and compile the stacked bidirectional-LSTM binary classifier.

    The model has two integer-sequence inputs of length ``max_codes``: code
    indices and month indices. Only the code branch feeds the output. The
    month branch is constructed but left disconnected, and the tabular dense
    branch of the early-fusion design is disabled.

    Reads the module-level names ``max_codes``, ``vocab_size`` and
    ``month_size``.

    Args:
        embedding_dim: Width of the code embedding.
        month_embedding_dim: Width of the month embedding.
        learning_rate: Learning rate passed to the Adadelta optimizer.
        clip_value: Per-element gradient clipping value for the optimizer.

    Returns:
        A compiled ``Model`` taking ``[code_sequence, month_sequence]`` and
        producing one sigmoid output per sample.
    """
    def l2_penalty():
        # Fresh regulariser instance per layer, same strength everywhere.
        return L1L2(l1=0.0, l2=0.1)

    # Shapes are given as tuples, which is the documented form for Input.
    code_input = Input(shape=(max_codes,))
    month_input = Input(shape=(max_codes,))

    # Code embedding for the LSTM stack.
    code_embedding = Embedding(vocab_size, embedding_dim, input_length=max_codes)(code_input)

    # Month embedding for the LSTM stack.
    month_embedding = Embedding(month_size, month_embedding_dim, input_length=max_codes)(month_input)

    code_lstm = Bidirectional(LSTM(units=16, return_sequences=True, kernel_regularizer=l2_penalty()))(code_embedding)
    # NOTE: the month branch is built but NOT merged into the output path
    # (the Add() merge is disabled). It is kept so that layer construction
    # matches the original experiment.
    _month_lstm = Bidirectional(LSTM(units=16, return_sequences=True, kernel_regularizer=l2_penalty()))(month_embedding)
    # To fuse both branches: features = Add()([code_lstm, _month_lstm])
    features = code_lstm

    # Two further bidirectional LSTM layers, each followed by dropout.
    for _ in range(2):
        features = Bidirectional(LSTM(units=8, return_sequences=True, kernel_regularizer=l2_penalty()))(features)
        features = Dropout(0.5)(features)

    # Per-timestep dense projection, then flatten for the classifier head.
    features = Dense(units=8, activation=LeakyReLU())(features)
    features = Dropout(0.5)(features)
    features = Flatten()(features)
    output = Dense(1, activation='sigmoid')(features)

    opt = Adadelta(learning_rate=learning_rate, clipvalue=clip_value)
    metrics = [
        AUC(num_thresholds=1000, name='auc', curve='ROC'),
        AUC(num_thresholds=1000, name='auprc', curve='PR'),
        tf.keras.metrics.Precision(name='prec'),
        tf.keras.metrics.Recall(name='rec'),
        tf.keras.metrics.TrueNegatives(name='TN'),
        tf.keras.metrics.TruePositives(name='TP'),
        tf.keras.metrics.PrecisionAtRecall(0.8),
    ]

    model = Model(inputs=[code_input, month_input], outputs=output)
    model.compile(
        loss='binary_crossentropy',
        optimizer=opt,
        metrics=metrics)
    return model

In [None]:
# Alternative (disabled): sklearn's 'balanced' class weights.
# from sklearn.utils import class_weight
# sklearn_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
# class_weight = dict(enumerate(sklearn_weights))

# Hyperparameters.
# NOTE(review): in the cells visible here only `epoch` is consumed (by
# model.fit). earlyFussion() does not read `lr`, `clipvalue` or the embedding
# lengths, and model.fit passes a literal batch size -- confirm which values
# are the intended ones.
#Hyperparameter
lr = 1e-5
clipvalue = 0.2
epoch = 1000
batch_size = 256
embedding_vector_length = 50
month_embedding_vector_length = 5
# Alternative embedding-width heuristics (disabled):
# embedding_vector_length = int(np.sqrt(vocab_size))
# embedding_vector_length = int(np.cbrt(vocab_size))
print(embedding_vector_length)

In [None]:
#visualise model
# Build the model and display its architecture.
model = earlyFussion()
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
model.summary()



In [None]:
%%time
# TF_GPU_ALLOCATOR=cuda_malloc_async
#training
with tf.device('/GPU:0'):
    # Stop once val_auc has not improved for 100 epochs and restore the best weights.
    earlyStopping = EarlyStopping(monitor='val_auc', patience=100, verbose=0, mode='max', restore_best_weights=True)
    # AUC is higher-is-better, so the checkpoint must track the maximum;
    # with mode='min' the saved file would hold the lowest-AUC epoch.
    mcp_save = ModelCheckpoint('../SeqModel/seqModel_therapy_tabSeq.mdl_wts.hdf5', save_best_only=True, monitor='val_auc', mode='max')
    # The model takes [code sequence, month sequence], truncated to max_codes.
    train_inputs = [Xs_train[:, :max_codes], Xm_train[:, :max_codes]]
    val_inputs = [Xs_val[:, :max_codes], Xm_val[:, :max_codes]]
    history = model.fit(
        train_inputs, y_train,
        validation_data=(val_inputs, y_val),
        epochs=epoch,
        # NOTE(review): literal 128 differs from the module-level `batch_size` -- confirm.
        batch_size=128,
        class_weight=class_weight,
        callbacks=[earlyStopping, mcp_save])

In [None]:
# %%time

# # create the model
# embedding_vector_length = 50
# earlyStopping = EarlyStopping(monitor='val_auc', patience=10, verbose=0, mode='max', restore_best_weights=True)
# mcp_save = ModelCheckpoint('../SeqModel/seqModel_therapy.mdl_wts.hdf5', save_best_only=True, monitor='val_auc', mode='min')

# with tf.device('/GPU:0'):
#     model = Sequential()
#     model.add(Embedding(vocab_size, embedding_vector_length, input_length=max_codes))
#     model.add(LSTM(128, return_sequences=True, kernel_regularizer=L1L2(l1=0.02, l2=0.03)))
#     model.add(Dropout(0.5))
#     model.add(LSTM(64,  kernel_regularizer=L1L2(l1=0.02, l2=0.03)))
#     model.add(Dropout(0.5))
#     model.add(Dense(32, activation=LeakyReLU(alpha=.3), kernel_regularizer=L1L2(l1=0.02, l2=0.03)))
#     model.add(Dense(1, activation='sigmoid'))
#     opt = Adadelta(learning_rate=5e-3, clipvalue=0.3)
#     metrics = [
#         AUC(num_thresholds=3, name='auc'),
#     ]
#     model.compile(loss='binary_crossentropy', optimizer=opt, metrics=metrics, )
#     print(model.summary())
#     history = model.fit(Xs_train, y_train, validation_data=(Xs_val, y_val), epochs=30, batch_size=128, class_weight = class_weight, callbacks = [earlyStopping, mcp_save])

In [None]:
# List every quantity recorded during training.
print(history.history.keys())

# Plot the training and validation curve for each tracked quantity.
# (history key, label used for the title and the y axis)
for metric_key, axis_label in (('auc', 'AUC'), ('loss', 'loss'), ('auprc', 'auprc')):
    plt.plot(history.history[metric_key])
    plt.plot(history.history['val_' + metric_key])
    plt.title('model ' + axis_label)
    plt.ylabel(axis_label)
    plt.xlabel('epoch')
    # The second curve is always the validation split, so it is labelled
    # 'val' on every plot (the AUC plot previously said 'test').
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()

In [None]:
# Predict on the internal-validation split, using the same two truncated
# sequence inputs the model was trained on. The final layer is a sigmoid,
# so `preds` holds continuous scores.
preds = model.predict([Xs_eval[:,:max_codes], Xm_eval[:,:max_codes]])

In [None]:
# Binarise the predicted scores at the 0.48 operating threshold
# (vectorised; scores below the threshold map to 0, everything else to 1).
eval_scores = np.ravel(preds)
preds_smooth = np.where(eval_scores < .48, 0, 1)
print(confusion_matrix(y_eval, preds_smooth))
# ROC AUC is a ranking metric: it is computed on the continuous scores.
# Feeding it thresholded labels collapses the curve to a single operating point.
print(roc_auc_score(y_eval, eval_scores))

In [None]:
# ROC curve of the raw predictions on the internal-validation split, keeping
# every threshold (drop_intermediate=False); the area under it is shown as
# the cell output.
fpr, tpr, thresholds = roc_curve(y_eval, preds, pos_label=1, drop_intermediate=False)
auc(fpr, tpr)

In [None]:
with tf.device('/GPU:0'):
    # The model is fitted with two inputs (code sequence, month sequence),
    # so evaluation passes the same pair; the tabular Xt_* arrays are not
    # model inputs.
    model.evaluate([Xs_eval[:, :max_codes], Xm_eval[:, :max_codes]], y_eval)
    model.evaluate([Xs_test[:, :max_codes], Xm_test[:, :max_codes]], y_test)
    # model.evaluate([Xs_testWales[:, :max_codes], Xm_testWales[:, :max_codes]], y_testWales)
    # model.evaluate([Xs_testScotland[:, :max_codes], Xm_testScotland[:, :max_codes]], y_testScotland)

In [None]:
#Model evaluation function
def summariseResult(testY, preds, scores=None):
    """Summarise binary-classification performance.

    Args:
        testY: True binary labels (0/1).
        preds: Predicted hard labels (0/1).
        scores: Optional continuous prediction scores. When given, AUC and
            AUPRC are computed from them; otherwise they are computed from
            ``preds``, as before.

    Returns:
        A 9-tuple rounded to 4 decimals:
        ``(accuracy, specificity, sensitivity, auc, auprc, balanced_accuracy,
        f1, ppv, npv)``. Specificity and sensitivity are fractions in [0, 1];
        PPV and NPV are percentages in [0, 100].
    """
    # labels=[0, 1] forces a 2x2 matrix even if one class is missing, so the
    # four-way unpack below cannot fail.
    tn, fp, fn, tp = confusion_matrix(testY, preds, labels=[0, 1]).ravel()
    specificity = tn / (tn + fp)
    sensitivity = tp / (tp + fn)
    ppv = 100 * tp / (tp + fp)
    npv = 100 * tn / (fn + tn)
    acc = accuracy_score(testY, preds)
    f1score = f1_score(testY, preds, average='binary')
    balanceacc = balanced_accuracy_score(testY, preds)

    # Ranking metrics use continuous scores when the caller supplies them.
    ranking_input = preds if scores is None else scores
    fpr, tpr, _ = roc_curve(testY, ranking_input, pos_label=1)
    aucscore = auc(fpr, tpr)
    auprc = average_precision_score(testY, ranking_input)

    summary = (acc, specificity, sensitivity, aucscore, auprc, balanceacc, f1score, ppv, npv)
    return tuple(np.round(value, 4) for value in summary)

# Summarise performance on the internal-validation split and every test split.
# Each entry is the [code sequence, month sequence] pair the model expects,
# truncated to max_codes. The X_eval / X_test / X_testWales / X_testScotland
# names used previously are never defined in this notebook.
data_test_Xs = [
    [Xs_eval[:, :max_codes], Xm_eval[:, :max_codes]],
    [Xs_test[:, :max_codes], Xm_test[:, :max_codes]],
    [Xs_testWales[:, :max_codes], Xm_testWales[:, :max_codes]],
    [Xs_testScotland[:, :max_codes], Xm_testScotland[:, :max_codes]],
]
data_test_ys = [y_eval, y_test, y_testWales, y_testScotland]
for data_test_X, data_test_y in zip(data_test_Xs, data_test_ys):
    with tf.device('/CPU:0'):
        preds = model.predict(data_test_X)
    # Binarise at 0.5 (scores below the threshold map to 0, everything else to 1).
    preds = np.where(np.ravel(preds) < 0.5, 0, 1)
    print(summariseResult(data_test_y, preds))

In [None]:
# Save the in-memory model to an HDF5 file. This is a different file from
# the ModelCheckpoint weights written during training.
model.save('../SeqModel/model_therapy.h5')

In [None]:
# from keras.models import load_model
# a = load_model('../SeqModel/my_model.h5')