# LSTM Model using Clinical + Therapy sequence

In [1]:
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.metrics import AUC, SensitivityAtSpecificity
from tensorflow.keras.optimizers import Adam, Adagrad, RMSprop, Adamax
from tensorflow.keras.initializers import Constant

#internal validation
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, f1_score, balanced_accuracy_score, matthews_corrcoef, auc, average_precision_score, roc_auc_score, balanced_accuracy_score, roc_curve, accuracy_score

import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import pickle

# Fix random seeds for reproducibility. set_random_seed seeds Python's `random`,
# NumPy and TensorFlow in a single call; tf.random.set_seed alone leaves the
# Python and NumPy generators unseeded.
tf.keras.utils.set_random_seed(1234)

# Column of `data` holding the 0/1 label that is stratified on and predicted.
target_outcome = 'outcome_combined_12months'
# Sequence length passed to the Embedding layer as input_length.
max_codes = 150

2023-10-26 11:12:42.514891: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-26 11:12:42.820312: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-26 11:12:42.820367: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-26 11:12:42.821127: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-10-26 11:12:42.893395: I tensorflow/core/platform/cpu_feature_g

In [2]:
# Load the pickled dataset and the code <-> index lookup tables.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load these files if they come from a trusted source.
def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)

data = _load_pickle('../SeqModel/data_all_big.sav')
code2idx = _load_pickle('../SeqModel/code2idx_all_big.sav')
idx2code = _load_pickle('../SeqModel/idx2code_all_big.sav')

# Number of entries in the code -> index mapping; used as the Embedding input dimension.
vocab_size = len(code2idx)
vocab_size

45211

In [3]:
# #Data split conventional (mixed countries)
# trainingData, testData = train_test_split(data, test_size=0.1, stratify=data[target_outcome], random_state=1234)
# trainingData, valData = train_test_split(trainingData, test_size=0.2, stratify=trainingData[target_outcome], random_state=1234)
# print(trainingData.shape)
# print(valData.shape)
# print(testData.shape)

In [4]:
# Data split (adults only, age >= 18):
#   train / val / eval (internal validation) = England
#   external test                            = Wales + Scotland (combined and per country)
adults = data[data.age >= 18]
england = adults[adults.Country == 'England']

# Hold out 20% of England as the validation set, then 20% of what remains as
# the internal evaluation set; both splits are stratified on the outcome.
trainingData, valData = train_test_split(
    england,
    test_size=0.2,
    stratify=england[target_outcome],
    random_state=1234,
)
trainingData, evalData = train_test_split(
    trainingData,
    test_size=0.2,
    stratify=trainingData[target_outcome],
    random_state=1234,
)

testDataWales = adults[adults.Country == 'Wales']
testDataScotland = adults[adults.Country == 'Scotland']
testData = adults[adults.Country.isin(['Wales', 'Scotland'])]

In [5]:
# Report the number of rows in every split.
split_frames = [
    ('Train: ', trainingData),
    ('Val: ', valData),
    ('Eval (internal validation): ', evalData),
    ('Test: ', testData),
    ('Test - Wales: ', testDataWales),
    ('Test - Scotland: ', testDataScotland),
]
for split_label, split_frame in split_frames:
    print(split_label, len(split_frame))

Train:  46410
Val:  14504
Eval (internal validation):  11603
Test:  2542
Test - Wales:  1804
Test - Scotland:  738


In [6]:
# Compare the number of distinct patids with the number of rows in the training split.
print(trainingData['patid'].unique().shape)
print(trainingData.shape)

(46410,)
(46410, 14)


In [7]:
# Make sure no patient id is shared between splits (each printed list should be empty).
def shared_patids(left, right):
    """Return, as a list, the patids that occur in both `left` and `right`."""
    return list(set(left.patid.values).intersection(set(right.patid.values)))

disjoint_pairs = [
    (trainingData, valData),
    (trainingData, evalData),
    (valData, evalData),
    (valData, testData),
    (trainingData, testData),
]
for left_frame, right_frame in disjoint_pairs:
    print(shared_patids(left_frame, right_frame))

# testDataScotland is a subset of testData, so an overlap is expected here.
print(shared_patids(testData, testDataScotland))

[]
[]
[]
[]
[]
[43487233, 43487235, 43464708, 43139079, 52082696, 43378699, 52336655, 43139096, 43165721, 52068381, 43378721, 43165731, 43165732, 43487269, 43165737, 43165746, 43325496, 43311166, 51984450, 43464772, 52072518, 51984457, 43227211, 52064331, 52064334, 43466834, 43378773, 43311191, 43339865, 43139165, 43284586, 43374709, 51894397, 52197514, 51943563, 52050060, 43139214, 51914900, 43315349, 52285590, 43165847, 43430042, 51992734, 43442340, 43145381, 43327661, 51894446, 43350191, 52302006, 52097207, 43491513, 43315386, 43135161, 43145404, 52279481, 52035777, 43253954, 43430083, 43192522, 43190477, 43339982, 43190478, 43253966, 51984594, 43303126, 43423963, 43430108, 43426012, 51869915, 51914974, 51861728, 52035808, 43430118, 52295910, 51984618, 43180267, 43251950, 43317488, 52035831, 43141371, 43340029, 43317502, 43190531, 43317508, 43190533, 43204870, 43491591, 43340044, 43317518, 43430159, 43190548, 43317526, 52085017, 43315483, 43208987, 43315486, 43317538, 43254052, 5221

In [8]:
# Proportion of each outcome value in every split.
for split_frame in (trainingData, valData, evalData, testData, testDataWales, testDataScotland):
    print(split_frame[target_outcome].value_counts(normalize=True))

0    0.831803
1    0.168197
Name: outcome_combined_12months, dtype: float64
0    0.831771
1    0.168229
Name: outcome_combined_12months, dtype: float64
0    0.831854
1    0.168146
Name: outcome_combined_12months, dtype: float64
0    0.74705
1    0.25295
Name: outcome_combined_12months, dtype: float64
0    0.752217
1    0.247783
Name: outcome_combined_12months, dtype: float64
0    0.734417
1    0.265583
Name: outcome_combined_12months, dtype: float64


In [9]:
# X and y
def as_feature_matrix(frame):
    """Collect the per-row entries of `read_code_seq_padded_idx` into one NumPy array."""
    return np.array(list(frame.read_code_seq_padded_idx.values))

def as_labels(frame):
    """Return the target outcome column of `frame` as a NumPy array."""
    return frame[target_outcome].values

X_train = as_feature_matrix(trainingData)
X_val = as_feature_matrix(valData)
X_eval = as_feature_matrix(evalData)
X_test = as_feature_matrix(testData)
X_testWales = as_feature_matrix(testDataWales)
X_testScotland = as_feature_matrix(testDataScotland)

y_train = as_labels(trainingData)
y_val = as_labels(valData)
y_eval = as_labels(evalData)
y_test = as_labels(testData)
y_testWales = as_labels(testDataWales)
y_testScotland = as_labels(testDataScotland)

In [10]:
# Report the number of samples in every feature array.
feature_arrays = [
    ('Train: ', X_train),
    ('Val: ', X_val),
    ('Eval (internal validation): ', X_eval),
    ('Test: ', X_test),
    ('Test - Wales: ', X_testWales),
    ('Test - Scotland: ', X_testScotland),
]
for array_label, feature_array in feature_arrays:
    print(array_label, feature_array.shape[0])

Train:  46410
Val:  14504
Eval (internal validation):  11603
Test:  2542
Test - Wales:  1804
Test - Scotland:  738


In [11]:
# Class weights derived from the outcome counts of the training split.
outcome_counts = trainingData[target_outcome].value_counts()
n_negative, n_positive = outcome_counts[0], outcome_counts[1]

# Negatives per positive: the weight applied to the positive class during training.
pos_weight = n_negative / n_positive
# Positives per negative (despite the name, this is not a weight for class 0).
neg_weight = n_positive / n_negative

class_weight = {0: 1, 1: pos_weight}
print(class_weight)

{0: 1, 1: 4.945426594926979}


In [12]:
# Constant initializer holding log(neg_weight), i.e. log(positives / negatives);
# it is used as the bias initializer of the final Dense layer.
output_bias = Constant(np.array([np.log(neg_weight)]))

In [13]:
%%time

# create the model
embedding_vector_length = 300
earlyStopping = EarlyStopping(monitor='val_auc', patience=10, verbose=0, mode='max', restore_best_weights=True)
mcp_save = ModelCheckpoint('../SeqModel/seqModel_all_new.mdl_wts.hdf5', save_best_only=True, monitor='val_auc', mode='min')


with tf.device('/CPU:0'):
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_vector_length, input_length=max_codes))
    model.add(LSTM(100, return_sequences=True, dropout=0.3, recurrent_dropout=0.5, recurrent_activation='relu'))
    model.add(LSTM(100, dropout=0.3))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(1, activation='sigmoid', bias_initializer=output_bias))
    opt = Adam(learning_rate=0.0001)
    metrics = [
        AUC(num_thresholds=10000, name='auc', curve='ROC'),
    ]
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=metrics, )
    print(model.summary())
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=100, class_weight = class_weight, callbacks = [earlyStopping, mcp_save])



2023-10-26 11:14:45.033098: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-10-26 11:14:45.261561: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-10-26 11:14:45.261616: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-10-26 11:14:45.267862: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-10-26 11:14:45.267916: I tensorflow/compile

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 300)          13563300  
                                                                 
 lstm (LSTM)                 (None, 150, 100)          160400    
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 128)               12928     
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 13817157 (52.71 MB)
Trainable params: 13817157 (52.71 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/1

2023-10-26 11:14:50.394515: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f0c89290fb0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2023-10-26 11:14:50.394566: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2023-10-26 11:14:50.407848: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-10-26 11:14:50.432695: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
2023-10-26 11:14:50.434251: E ./tensorflow/compiler/xla/stream_executor/stream_executor_internal.h:124] SetPriority unimplemented for this stream.
2023-10-26 11:14:50.441138: E ./tensorflow/compiler/xla/stream_executor/stream_executor_internal.h:124] SetPriority unimplemented for this stream.


  2/465 [..............................] - ETA: 2:33 - loss: 1.2629 - auc: 0.4422 

2023-10-26 11:14:50.862595: E ./tensorflow/compiler/xla/stream_executor/stream_executor_internal.h:124] SetPriority unimplemented for this stream.


  3/465 [..............................] - ETA: 2:40 - loss: 1.4206 - auc: 0.5238

2023-10-26 11:14:51.211345: E ./tensorflow/compiler/xla/stream_executor/stream_executor_internal.h:124] SetPriority unimplemented for this stream.




  saving_api.save_model(


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100

KeyboardInterrupt: 

In [None]:
# list all data in history
print(history.history.keys())

def plot_training_curves(metric, title, ylabel):
    """Plot the training and validation curves of `metric` recorded in `history`.

    The validation series is read from the 'val_' + metric key, so both curves
    are labelled 'train' and 'val' (the second curve is the validation set
    passed to fit(), not a test set).
    """
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()

# summarize history for AUC
plot_training_curves('auc', 'model AUC', 'AUC')
# summarize history for loss
plot_training_curves('loss', 'model loss', 'loss')

In [None]:
# Run model.evaluate on the internal evaluation set and on each external test set.
evaluation_sets = [
    (X_eval, y_eval),
    (X_test, y_test),
    (X_testWales, y_testWales),
    (X_testScotland, y_testScotland),
]
with tf.device('/CPU:0'):
    for eval_X, eval_y in evaluation_sets:
        model.evaluate(eval_X, eval_y)

In [None]:
#Model evaluation function
def summariseResult (testY, preds, scores=None):
    """Summarise binary-classification performance.

    Parameters
    ----------
    testY : array-like
        Ground-truth 0/1 labels.
    preds : array-like
        Predicted 0/1 labels (already thresholded).
    scores : array-like, optional
        Continuous scores for the positive class (e.g. predicted
        probabilities). When given, AUROC and AUPRC are computed from these
        scores. When omitted, they are computed from `preds`, which yields only
        a single operating point rather than a ranking-based curve.

    Returns
    -------
    tuple
        (accuracy, specificity, sensitivity, AUROC, AUPRC, balanced accuracy,
        F1, PPV, NPV), each rounded to 4 decimals. PPV and NPV are expressed
        as percentages (0-100); the remaining values are fractions (0-1).
    """
    ranking = preds if scores is None else np.ravel(scores)
    # labels=[0, 1] keeps the matrix 2x2 even when one class is missing from
    # testY/preds, so the four-way unpacking below cannot fail.
    tn, fp, fn, tp = confusion_matrix(testY, preds, labels=[0, 1]).ravel()
    specificity = tn / (tn+fp)
    sensitivity = tp / (tp+fn)
    ppv = 100*tp/(tp+fp)
    npv = 100*tn/(fn+tn)
    acc = accuracy_score(testY, preds)
    f1score = f1_score(testY, preds, average = 'binary')
    balanceacc = balanced_accuracy_score(testY, preds)
    # Threshold-free metrics must be computed from continuous scores.
    fpr, tpr, thresholds = roc_curve(testY, ranking, pos_label=1)
    aucscore = auc(fpr, tpr)
    auprc = average_precision_score(testY, ranking)
    return np.round(acc,4), np.round(specificity,4), np.round(sensitivity,4), np.round(aucscore,4), np.round(auprc,4), np.round(balanceacc,4), np.round(f1score,4), np.round(ppv,4), np.round(npv,4)

data_test_Xs = [X_eval, X_test, X_testWales, X_testScotland]
data_test_ys = [y_eval, y_test, y_testWales, y_testScotland]
for data_test_X, data_test_y in zip(data_test_Xs, data_test_ys):
    with tf.device('/CPU:0'):
        probs = model.predict(data_test_X).ravel()
    # Threshold at 0.5 for the label-based metrics; the raw probabilities are
    # passed separately so AUROC / AUPRC are computed from the full ranking.
    preds = np.where(probs < 0.5, 0, 1)
    print(summariseResult(data_test_y, preds, probs))

In [None]:
# Save the fitted model to an .h5 file next to the other SeqModel artefacts.
model.save('../SeqModel/model_all.h5')