In [1]:
# Import libraries
import psycopg2
import getpass
import matplotlib.pyplot as plt
import numpy as np
import os.path
from sklearn.model_selection import train_test_split

In [2]:
# Each pair is (file name stem, processed feature name); the two lists below are derived from it
# so that position i in one always lines up with position i in the other.
_feature_sources = [
    ('admission_type', 'admission_type'),
    ('age', 'age'),
    ('aids_haem_mets', 'aids_haem_mets'),
    ('bicarb', 'bicarb_24h'),
    ('bilirubin', 'bilirubin_24h'),
    ('blood_pressure', 'bp_24h'),
    ('fio2', 'fio2_24h'),
    ('gcs_eyes', 'gcs_eyes_24h'),
    ('gcs_motor', 'gcs_motor_24h'),
    ('gcs_verbal', 'gcs_verbal_24h'),
    ('heart_rate', 'hr_24h'),
    ('pao2', 'pao2_24h'),
    ('potassium', 'potassium_24h'),
    ('sodium', 'sodium_24h'),
    ('temperature', 'temp_24h'),
    ('urea', 'urea_24h'),
    ('urine', 'urine_24h'),
    ('wbc', 'wbc_24h'),
]

# Feature file names
feature_files = [file_stem for file_stem, _ in _feature_sources]

# Processed feature names
feature_names = [feature_key for _, feature_key in _feature_sources]

In [None]:
# Load the data
# Every res/<file name>.npy is unpacked with .tolist() and then indexed by its processed feature name.
# NOTE(review): allow_pickle=True can run arbitrary code while unpickling - only load files you trust.
for file_stem, feature_key in zip(feature_files, feature_names):
    _data = np.load('res/{}.npy'.format(file_stem), allow_pickle=True).tolist()
    # Bind the array to a notebook-level variable named after the feature.
    # A direct globals() assignment replaces the string-built exec() statement.
    globals()[feature_key] = _data[feature_key]
print("All files loaded!")

In [None]:
# Check shape of files
# Look each loaded array up by name instead of exec()-ing generated source code.
for feature_key in feature_names:
    print("{}: ".format(feature_key))
    shape = np.shape(globals()[feature_key])
    print(shape)
    print("")

In [None]:
# Broadcast static features so they have 24 time steps
def _repeat_static(source, column, steps=24):
    '''Return column 0 of source followed by `steps` side-by-side copies of source[:, column].'''
    repeated = np.tile(source[:,column], (steps,1)).T
    return np.column_stack((source[:,0], repeated))

sched_surg_24h = _repeat_static(admission_type, 1)
unsched_surg_24h = _repeat_static(admission_type, 2)
medical_24h = _repeat_static(admission_type, 3)
age_24h = _repeat_static(age, 1)
aids_24h = _repeat_static(aids_haem_mets, 1)
haem_24h = _repeat_static(aids_haem_mets, 2)
mets_24h = _repeat_static(aids_haem_mets, 3)

# Check the dimensions are correct
for label, broadcast in (
    ("sched_surg_24h: ", sched_surg_24h),
    ("unsched_surg_24h: ", unsched_surg_24h),
    ("medical_24h: ", medical_24h),
    ("age_24h: ", age_24h),
    ("aids_24h: ", aids_24h),
    ("haem_24h: ", haem_24h),
    ("mets_24h: ", mets_24h),
):
    print(label, np.shape(broadcast))

In [None]:
# Collect every per-feature array in the order the features should appear in the dataset
stacked_features = (
    sched_surg_24h, unsched_surg_24h, medical_24h, age_24h, aids_24h, haem_24h, mets_24h,
    bicarb_24h, bilirubin_24h, bp_24h, fio2_24h,
    gcs_eyes_24h, gcs_motor_24h, gcs_verbal_24h,
    hr_24h, pao2_24h, potassium_24h, sodium_24h, temp_24h, urea_24h, urine_24h, wbc_24h,
)

# Stack along a new third axis, one slice per feature
dataset = np.dstack(stacked_features)

# Drop index 0 of the second axis (the patient IDs), then reorder the axes so the shape is (m,n,T)
dataset = np.transpose(dataset[:,1:,:], (0,2,1))

# Check the shape of the dataset
print("dataset: ", np.shape(dataset))

In [None]:
# Sanity-check the value range of every feature (names listed in the same order as the dataset's feature axis)
feat_names = [
    "sched_surg_24h", "unsched_surg_24h", "medical_24h", "age_24h", "aids_24h", "haem_24h", "mets_24h",
    "bicarb_24h", "bilirubin_24h", "bp_24h", "fio2_24h",
    "gcs_eyes_24h", "gcs_motor_24h", "gcs_verbal_24h",
    "hr_24h", "pao2_24h", "potassium_24h", "sodium_24h", "temp_24h", "urea_24h", "urine_24h", "wbc_24h",
]

n = len(feat_names)
for i, feat_label in enumerate(feat_names):
    # All examples and all time steps of feature i
    feature = dataset[:,i,:]
    summary = [round(statistic(feature),2) for statistic in (np.min, np.max, np.mean, np.std)]
    print("{}:".format(feat_label))
    print("Min = {}; Max = {}; Mean = {}; Standard Deviation = {}".format(*summary))
    print("")

In [None]:
# The statistics above show some features dipping below zero: floor every value at 0 and leave the upper end unbounded
clipped_dataset = np.clip(dataset, a_min=0, a_max=None)

In [None]:
# Load the mortality data (unpacked with .tolist() and indexed by 'mortality')
_data = np.load('res/mortality.npy', allow_pickle=True).tolist()

# Keep only column 1; column 0 (the patient IDs) is dropped
mortality = _data['mortality'][:,1]

# Check the shape of the mortality
print("mortality: ", np.shape(mortality))

In [None]:
# First split: 80% of the examples go to training, the remaining 20% are held out
X_train, X_holdout, y_train, y_holdout = train_test_split(clipped_dataset, mortality, train_size=0.8, random_state=42)

# Second split: halve the held-out examples into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_holdout, y_holdout, train_size=0.5, random_state=42)

# Check the shapes of the outputs
for template, part in (
    ("X_train: {}", X_train),
    ("y_train: {}", y_train),
    ("X_val: {}", X_val),
    ("y_val: {}", y_val),
    ("X_test: {}", X_test),
    ("y_test: {}", y_test),
):
    print(template.format(np.shape(part)))

In [None]:
# Summarise the size of each split and how many positive (died in hospital) labels it contains
label_splits = (y_train, y_val, y_test)
split_sizes = [len(X_train), len(X_val), len(X_test)]
death_counts = [sum(labels) for labels in label_splits]
death_rates = [round(deaths/len(labels)*100,3) for deaths, labels in zip(death_counts, label_splits)]

print("No. of patients: {}".format(len(dataset)))
print("No. of patients in train, val, test sets: {}, {}, {}".format(*split_sizes))
print("No. of patients who died in hospital in train, val, test sets: {}, {}, {}".format(*[str(deaths) for deaths in death_counts]))
print("Proportion of patients who died in hospital in train, val, test sets: {}%, {}%, {}%".format(*death_rates))

In [None]:
# Write a function that normalises the data using min-max scaling, treating each feature separately
def MinMaxScaler_3D(array, minimum=None, maximum=None):

    '''
    Min-max scale a 3D array one feature at a time using y = (x - min) / (max - min).

    INPUTS:
    array - a 3D array of shape (m, n, T), where m = number of examples, n = number of features, and T = number of time steps.
    minimum (optional) - the minimum you want to apply for the scaling. If not specified, the minimum will be calculated from the array.
    maximum (optional) - the maximum you want to apply for the scaling. If not specified, the maximum will be calculated from the array.
    -> note that minimum and maximum, if specified, need to be n-length vectors (lists or NumPy arrays), where each entry represents the min/max for each feature in the array.

    OUTPUTS:
    scaled_array - a 3D float array of shape (m, n, T) where each entry has been scaled with the min/max of its own feature.
                   A feature whose max equals its min is divided by 1 instead of 0, so it comes out as (x - min).
    min_vector, max_vector (optional) - the n-length vectors of minimums/maximums that were applied.
                   Both are returned, together with scaled_array, only when minimum is not specified;
                   when minimum is given, scaled_array is returned on its own.
    '''

    # First, get the dimensions of the input array
    m, n, T = np.shape(array)

    # Use `is None`, not `== None`: comparing a NumPy array with None is element-wise,
    # and the resulting array cannot be used as an if-condition.
    minimum_was_computed = minimum is None

    # Calculate the minimum of each feature
    if minimum_was_computed:
        min_vector = [np.min(array[:,i,:]) for i in range(n)]
    else:
        min_vector = minimum

    # Calculate the maximum of each feature
    if maximum is None:
        max_vector = [np.max(array[:,i,:]) for i in range(n)]
    else:
        max_vector = maximum

    # Scale each feature using the formula: y = (x - min) / (max - min)
    scaled_array = np.zeros((m,n,T))
    for i in range(n):
        span = max_vector[i] - min_vector[i]
        if span == 0:
            # Constant feature: avoid dividing by zero (which would produce NaN/inf)
            span = 1
        scaled_array[:,i,:] = (array[:,i,:] - min_vector[i]) / span

    # Return the outputs (the vectors are only handed back when the minimum had to be computed here)
    if minimum_was_computed:
        return scaled_array, min_vector, max_vector
    return scaled_array

In [None]:
# Write a function that normalises the data using the z-score, treating each feature separately
def ZScoreScaler_3D(array, mean=None, std=None):

    '''
    Z-score scale a 3D array one feature at a time using y = (x - mean) / std.

    INPUTS:
    array - a 3D array of shape (m, n, T), where m = number of examples, n = number of features, and T = number of time steps.
    mean (optional) - the mean you want to apply for the scaling. If not specified, the mean will be calculated from the array.
    std (optional) - the standard deviation you want to apply for the scaling. If not specified, the standard deviation will be calculated from the array.
    -> note that mean and std, if specified, need to be n-length vectors (lists or NumPy arrays), where each entry represents the mean/std for each feature in the array.

    OUTPUTS:
    scaled_array - a 3D float array of shape (m, n, T) where each entry has been scaled with the mean/std of its own feature.
                   A feature whose std is 0 is divided by 1 instead of 0, so it comes out as (x - mean).
    mean_vector, std_vector (optional) - the n-length vectors of means/standard deviations that were applied.
                   Both are returned, together with scaled_array, only when mean is not specified;
                   when mean is given, scaled_array is returned on its own.
    '''

    # First, get the dimensions of the input array
    m, n, T = np.shape(array)

    # Use `is None`, not `== None`: comparing a NumPy array with None is element-wise,
    # and the resulting array cannot be used as an if-condition.
    mean_was_computed = mean is None

    # Calculate the mean of each feature
    if mean_was_computed:
        mean_vector = [np.mean(array[:,i,:]) for i in range(n)]
    else:
        mean_vector = mean

    # Calculate the standard deviation of each feature
    if std is None:
        std_vector = [np.std(array[:,i,:]) for i in range(n)]
    else:
        std_vector = std

    # Scale each feature using the formula: y = (x - mean) / std
    scaled_array = np.zeros((m,n,T))
    for i in range(n):
        spread = std_vector[i]
        if spread == 0:
            # Constant feature: avoid dividing by zero (which would produce NaN/inf)
            spread = 1
        scaled_array[:,i,:] = (array[:,i,:] - mean_vector[i]) / spread

    # Return the outputs (the vectors are only handed back when the mean had to be computed here)
    if mean_was_computed:
        return scaled_array, mean_vector, std_vector
    return scaled_array

In [None]:
# Fit the z-score statistics on the training split only, keeping them for reuse
X_train_norm, mean_vector, std_vector = ZScoreScaler_3D(X_train)

# Apply those training statistics unchanged to the validation and test splits
X_val_norm = ZScoreScaler_3D(X_val, mean=mean_vector, std=std_vector)
X_test_norm = ZScoreScaler_3D(X_test, mean=mean_vector, std=std_vector)

In [None]:
# Print the shape, minimum and maximum of each scaled split to make sure everything has worked OK
for heading, scaled_split in (
    ("X_train_norm:", X_train_norm),
    ("X_val_norm:", X_val_norm),
    ("X_test_norm:", X_test_norm),
):
    lowest = round(np.min(scaled_split),2)
    highest = round(np.max(scaled_split),2)
    print(heading)
    print("Shape: {}, minimum = {}, maximum = {}".format(np.shape(scaled_split), lowest, highest))
    print("")

In [None]:
# Save to data.npy
# exist_ok=True makes the directory creation safe whether or not ./res is already there
os.makedirs('./res', exist_ok=True)

# Raw and z-scored inputs plus the labels for every split.
# Each array gets its own key: the scaled test set is stored under 'X_test_norm' so it cannot overwrite 'X_test'.
tosave = {'X_train': X_train, 'X_val': X_val, 'X_test': X_test,
          'X_train_norm': X_train_norm, 'X_val_norm': X_val_norm, 'X_test_norm': X_test_norm,
          'y_train': y_train, 'y_val': y_val, 'y_test': y_test}

# np.save pickles the dict, so it has to be read back with np.load(..., allow_pickle=True)
np.save('res/data.npy',tosave)
print("Saved!")