In [1]:
# Import libraries
import psycopg2
import getpass
import matplotlib.pyplot as plt
import numpy as np
import os.path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Base names of the per-feature files; the loading cell reads 'res/<name>.npy'
feature_files = [
    'admission_type',
    'age',
    'aids_haem_mets',
    'bicarb',
    'bilirubin',
    'blood_pressure',
    'fio2',
    'gcs_eyes',
    'gcs_motor',
    'gcs_verbal',
    'heart_rate',
    'pao2',
    'potassium',
    'sodium',
    'temperature',
    'urea',
    'urine',
    'wbc',
]

# Processed feature names, position-aligned with feature_files: entry i is the
# key read from file i and the variable name the array is bound to
feature_names = [
    'admission_type',
    'age',
    'aids_haem_mets',
    'bicarb_24h',
    'bilirubin_24h',
    'bp_24h',
    'fio2_24h',
    'gcs_eyes_24h',
    'gcs_motor_24h',
    'gcs_verbal_24h',
    'hr_24h',
    'pao2_24h',
    'potassium_24h',
    'sodium_24h',
    'temp_24h',
    'urea_24h',
    'urine_24h',
    'wbc_24h',
]

In [3]:
# Load the data
# Each 'res/<file>.npy' is unwrapped with .tolist() and then indexed by the
# matching entry of feature_names (presumably a pickled dict - verify against
# the notebook that wrote these files).
# NOTE(review): allow_pickle=True unpickles arbitrary objects, which can run
# code embedded in the file; only load .npy files from a trusted source.
for file_name, feature_name in zip(feature_files, feature_names):
    _data = np.load('res/{}.npy'.format(file_name), allow_pickle=True).tolist()
    # Bind the array to a notebook-level variable named after the feature.
    # Writing to globals() directly avoids generating and exec-ing source text.
    globals()[feature_name] = _data[feature_name]
print("All files loaded!")

All files loaded!


In [4]:
# Check shape of files
# Look each feature array up by name in the notebook namespace instead of
# building source code with str.format and running it through exec.
for feature_name in feature_names:
    print("{}: ".format(feature_name))
    shape = np.shape(globals()[feature_name])
    print(shape)
    print("")

admission_type: 
(38549, 4)

age: 
(38549, 2)

aids_haem_mets: 
(38549, 4)

bicarb_24h: 
(38549, 25, 1)

bilirubin_24h: 
(38549, 25, 1)

bp_24h: 
(38549, 25, 1)

fio2_24h: 
(38549, 25, 1)

gcs_eyes_24h: 
(38549, 25, 1)

gcs_motor_24h: 
(38549, 25, 1)

gcs_verbal_24h: 
(38549, 25, 1)

hr_24h: 
(38549, 25, 1)

pao2_24h: 
(38549, 25, 1)

potassium_24h: 
(38549, 25, 1)

sodium_24h: 
(38549, 25, 1)

temp_24h: 
(38549, 25, 1)

urea_24h: 
(38549, 25, 1)

urine_24h: 
(38549, 25, 1)

wbc_24h: 
(38549, 25, 1)



In [5]:
# Broadcast static features so they have 24 time steps
def _tile_with_first_column(table, value_col, steps=24):
    """Return column 0 of `table` followed by `steps` copies of column `value_col`.

    The result has shape (rows, steps + 1): the first column is carried over
    unchanged and the chosen value column is repeated once per time step.
    """
    repeated = np.tile(table[:, value_col], (steps, 1)).T
    return np.column_stack((table[:, 0], repeated))


# admission_type columns 1-3 -> scheduled surgical / unscheduled surgical / medical
sched_surg_24h = _tile_with_first_column(admission_type, 1)
unsched_surg_24h = _tile_with_first_column(admission_type, 2)
medical_24h = _tile_with_first_column(admission_type, 3)

# age has a single value column
age_24h = _tile_with_first_column(age, 1)

# aids_haem_mets columns 1-3 -> aids / haem / mets
aids_24h = _tile_with_first_column(aids_haem_mets, 1)
haem_24h = _tile_with_first_column(aids_haem_mets, 2)
mets_24h = _tile_with_first_column(aids_haem_mets, 3)

# Check the dimensions are correct
for _label, _arr in (
    ("sched_surg_24h: ", sched_surg_24h),
    ("unsched_surg_24h: ", unsched_surg_24h),
    ("medical_24h: ", medical_24h),
    ("age_24h: ", age_24h),
    ("aids_24h: ", aids_24h),
    ("haem_24h: ", haem_24h),
    ("mets_24h: ", mets_24h),
):
    print(_label, np.shape(_arr))

sched_surg_24h:  (38549, 25)
unsched_surg_24h:  (38549, 25)
medical_24h:  (38549, 25)
age_24h:  (38549, 25)
aids_24h:  (38549, 25)
haem_24h:  (38549, 25)
mets_24h:  (38549, 25)


In [6]:
# Stack all features into one array along a new last axis.
# np.dstack promotes the 2-D static arrays to 3-D, so they line up with the
# time-series arrays that already carry a trailing axis of length 1.
_feature_arrays = (
    sched_surg_24h, unsched_surg_24h, medical_24h, age_24h,
    aids_24h, haem_24h, mets_24h,
    bicarb_24h, bilirubin_24h, bp_24h, fio2_24h,
    gcs_eyes_24h, gcs_motor_24h, gcs_verbal_24h,
    hr_24h, pao2_24h, potassium_24h, sodium_24h,
    temp_24h, urea_24h, urine_24h, wbc_24h,
)
_stacked = np.dstack(_feature_arrays)

# Drop index 0 along axis 1 (the patient IDs), then swap the last two axes
# so the dataset is in the shape (m,n,T)
dataset = np.transpose(_stacked[:, 1:, :], (0, 2, 1))

# Check the shape of the dataset
print("dataset: ", np.shape(dataset))

dataset:  (38549, 22, 24)


In [7]:
# Load the mortality data and pull out the array stored under 'mortality'
_data = np.load('res/mortality.npy', allow_pickle=True).tolist()
_mortality_table = _data['mortality']

# Keep only column 1; column 0 holds the patient IDs
mortality = _mortality_table[:, 1]

# Check the shape of the mortality
print("mortality: ", np.shape(mortality))

mortality:  (38549,)


In [8]:
# Split the data: 80% for training, the remaining 20% held out
X_train, _X_holdout, y_train, _y_holdout = train_test_split(
    dataset, mortality, train_size=0.8, random_state=42
)

# Halve the held-out part into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(
    _X_holdout, _y_holdout, train_size=0.5, random_state=42
)

# The intermediate hold-out arrays are no longer needed
del _X_holdout, _y_holdout

# Check the shapes of the outputs
for _template, _arr in (
    ("X_train: {}", X_train),
    ("y_train: {}", y_train),
    ("X_val: {}", X_val),
    ("y_val: {}", y_val),
    ("X_test: {}", X_test),
    ("y_test: {}", y_test),
):
    print(_template.format(np.shape(_arr)))

X_train: (30839, 22, 24)
y_train: (30839,)
X_val: (3855, 22, 24)
y_val: (3855,)
X_test: (3855, 22, 24)
y_test: (3855,)


In [9]:
# Print information about the created arrays
_label_sets = (y_train, y_val, y_test)

# Sum of the labels per split (the death count if labels are 0/1 - verify)
_deaths = [sum(_labels) for _labels in _label_sets]

# Same quantity as a percentage of each split, rounded to 3 decimal places
_death_pct = [
    round(_count / len(_labels) * 100, 3)
    for _count, _labels in zip(_deaths, _label_sets)
]

print("No. of patients: {}".format(len(dataset)))
print("No. of patients in train, val, test sets: {}, {}, {}".format(
    len(X_train), len(X_val), len(X_test)))
print("No. of patients who died in hospital in train, val, test sets: {}, {}, {}".format(
    str(_deaths[0]), str(_deaths[1]), str(_deaths[2])))
print("Proportion of patients who died in hospital in train, val, test sets: {}%, {}%, {}%".format(
    _death_pct[0], _death_pct[1], _death_pct[2]))

No. of patients: 38549
No. of patients in train, val, test sets: 30839, 3855, 3855
No. of patients who died in hospital in train, val, test sets: 3458.0, 417.0, 445.0
Proportion of patients who died in hospital in train, val, test sets: 11.213%, 10.817%, 11.543%


In [10]:
# Normalise every split with min/max statistics learned from the TRAINING set only.
#
# Fitting a fresh scaler on the validation and test sets would (a) map the three
# splits onto different scales, so the same raw value ends up as a different
# model input depending on the split, and (b) leak held-out statistics into
# preprocessing. The scalers are therefore fitted once on X_train and reused.
# As a consequence, validation/test values can fall slightly outside [0, 1].


def _fit_feature_scalers(X):
    """Fit one MinMaxScaler per feature on an array of shape (m, n, T).

    Each scaler sees the (m, T) slice X[:, i, :], so MinMaxScaler learns a
    separate min/max for every time-step column of feature i.

    Returns a list of n fitted scalers, in feature order.
    """
    n_features = np.shape(X)[1]
    return [MinMaxScaler().fit(X[:, i, :]) for i in range(n_features)]


def _apply_feature_scalers(X, scalers):
    """Transform an (m, n, T) array feature by feature with pre-fitted scalers.

    Returns a new float array of the same shape; X itself is not modified.
    """
    X_norm = np.zeros(np.shape(X))
    for i, scaler in enumerate(scalers):
        X_norm[:, i, :] = scaler.transform(X[:, i, :])
    return X_norm


# Learn the scaling from the training data only
_feature_scalers = _fit_feature_scalers(X_train)

# Apply the same scaling to every split
X_train_norm = _apply_feature_scalers(X_train, _feature_scalers)
print("X_train_norm: {}".format(np.shape(X_train_norm)))

X_val_norm = _apply_feature_scalers(X_val, _feature_scalers)
print("X_val_norm: {}".format(np.shape(X_val_norm)))

X_test_norm = _apply_feature_scalers(X_test, _feature_scalers)
print("X_test_norm: {}".format(np.shape(X_test_norm)))

X_train_norm: (30839, 22, 24)
X_val_norm: (3855, 22, 24)
X_test_norm: (3855, 22, 24)


In [11]:
# Save to data.npy
# Make sure the output directory exists before writing into it
_out_dir = './res'
if not os.path.exists(_out_dir):
    os.makedirs(_out_dir)

# Bundle the normalised inputs and their labels into a single dict
tosave = {
    'X_train': X_train_norm,
    'y_train': y_train,
    'X_val': X_val_norm,
    'y_val': y_val,
    'X_test': X_test_norm,
    'y_test': y_test,
}
np.save('res/data.npy', tosave)
print("Saved!")

Saved!
