In [1]:
# Import libraries
import numpy as np
import pandas as pd
import psycopg2
import getpass
import time
import os.path
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the database connection details
_data = np.load('data/db_details.npy', allow_pickle=True).tolist()
db_details = _data['db_details']
user = db_details['user']
host = db_details['host']
port = db_details['port']
dbname = db_details['dbname']
schema = db_details['schema']
    
# Connect to the database
conn = psycopg2.connect(dbname=dbname, user=user, host=host, port=port, password=getpass.getpass(prompt='Password:'.format(user)))
cur = conn.cursor()
cur.execute('SET search_path TO {}'.format(schema))
conn.commit()

Password: ··········


In [3]:
# Load the patient data
_data = np.load('data/patients.npy', allow_pickle=True).tolist()
patients = _data['patients']
print("Loaded!")

Loaded!


In [4]:
# Function to query the database for a specific feature
def get_feature(feature_name):
    
    '''
    Queries the MIMIC-III database for the specified feature.
    SQL_queries is a dictionary with feature names as keys, and the corresponding SQL queries as values.
    '''
    
    start = time.time()

    # Load the query
    query = open('queries/{}.sql'.format(feature_name), 'r')

    # Store the result of the query as a pandas dataframe
    result = pd.read_sql_query(query.read(), conn)
    query.close()

    end = time.time()
    
    # Print run time
    print("The '{}' query took {:.2f} seconds".format(feature_name, end-start))
    
    # Return the feature
    return result

In [5]:
# Function that removes missing values
def remove_nan(feature):
    
    '''
    Removes nan values from the feature set
    feature is a pandas DataFrame representing some feature queried from the MIMIC-III database
    '''
    
    # Remove rows where valuenum is not a number
    result = feature.dropna(axis=0, subset=['valuenum'])
    
    # Calculate how many rows were removed
    before, after = len(feature), len(result)
    diff = before - after
    
    # Print how many rows were removed
    print("{} rows were removed ({:.2f}% of all measurements)".format(diff, 100*diff/before))
    
    # Return the cleaned feature dataframe
    return result

In [6]:
# Function that creates a list of numpy arrays with all measurements for each patient sorted by time
def collect_all_measurements(feature, patients):
    '''
    Creates a list of numpy arrays, where each array has the value of measurements and the corresponding times they were taken for each patient.
    feature is a pandas DataFrame representing some feature queried from the MIMIC-III database
    patients is a pandas DataFrame that has been previously loaded, containing patient details for patients of interest
    '''
    
    start = time.time()
    
    # Create an empty list in which to store the data
    results = []
    
    # Extract subject ID, hospital admission time and hospital discharge time
    m = len(patients) # number of patients
    hadm_id = patients['hadm_id']
    admittime = patients['admittime']
    dischtime = patients['dischtime']
    
    # Iterate over every patient
    for i in range(m):
    
        # Define subject ID
        h_id = int(hadm_id[i])
    
        # Select data for this subject
        measurements = feature[feature['hadm_id']==h_id]
        
        # Get hospital admission and discharge times for this subject
        admit = admittime[i]
        disch = dischtime[i]
        
        # Discard measurements before admission and after discharge
        measurements = measurements[measurements['charttime'] > admit] # remove measurements before admission
        measurements = measurements[measurements['charttime'] < disch] # remove measurements after discharge
        
        # Sort the measurements by time
        measurements = measurements.sort_values(by=['charttime'])
        
        # Create a final array for this subject, with one column for datetimes and one column for measurements
        res = np.vstack((np.array([x.to_pydatetime() for x in measurements['charttime']]), np.array(measurements['valuenum']))).T
        
        # Append this subjects' array to the results list (after casting to a numpy array)
        results.append(res)
        
    end = time.time()
    
    # Print run time
    print("It took {} minutes to collect all the measurements for each patient".format(round((end-start)/60,2)))
        
    # Return the results
    return results

In [7]:
# Specify the features
features = [
    'bicarb',
    'bilirubin',
    'bp',
    'fio2',
    'hr',
    'pao2',
    'potassium',
    'resp',
    'sodium',
    'spo2',
    'temp',
    'urea',
    'urine',
    'wbc'
]

# Iterate over every feature
for i in features:
    
    # Define a pre-cursor variable name
    precursor = '_' + i
    
    # Query the database for the feature
    exec(precursor + " = get_feature('{}')".format(i))
    
    # Remove any missing values
    exec(precursor + " = remove_nan({})".format(precursor))
    
    # Collect measurements for every patient, excluding measurements outside the relevant hospital stay
    exec(i + " = collect_all_measurements({}, patients)".format(precursor))
    
    # Print a separator between features
    print('')

The 'bicarb' query took 6.35 seconds
209 rows were removed (0.03% of all measurements)
It took 0.99 minutes to collect all the measurements for each patient

The 'bilirubin' query took 2.57 seconds
29 rows were removed (0.01% of all measurements)
It took 0.87 minutes to collect all the measurements for each patient

The 'bp' query took 26.56 seconds
15168 rows were removed (0.25% of all measurements)
It took 3.78 minutes to collect all the measurements for each patient

The 'fio2' query took 23.48 seconds
10996 rows were removed (0.51% of all measurements)
It took 1.79 minutes to collect all the measurements for each patient

The 'hr' query took 32.26 seconds
1446 rows were removed (0.02% of all measurements)
It took 4.72 minutes to collect all the measurements for each patient

The 'pao2' query took 6.17 seconds
108 rows were removed (0.02% of all measurements)
It took 0.96 minutes to collect all the measurements for each patient

The 'potassium' query took 8.90 seconds
446 rows were 

In [8]:
# Convert Fahrenheit temperatures to Celsius (in temp)
start = time.time()
for i in range(len(temp)): # iterate over all patients
    data = temp[i][:,1] # select the values only
    for k in range(len(data)): # iterate over each value and convert if necessary
        data[k] = (data[k]-32)*5/9 if data[k]>50 else data[k]
end = time.time()

# Print run time
print("It took {} seconds to convert all temperatures to Celsius".format(round((end-start),2)))

It took 0.39 seconds to convert all temperatures to Celsius


In [9]:
# Save to raw_features.npy
if not os.path.exists('./data'):
    os.makedirs('./data')

tosave = {
    '_bicarb': _bicarb, 'bicarb': bicarb,
    '_bilirubin': _bilirubin, 'bilirubin': bilirubin,
    '_bp': _bp, 'bp': bp,
    '_fio2': _fio2, 'fio2': fio2,
    '_hr': _hr, 'hr': hr,
    '_pao2': _pao2, 'pao2': pao2,
    '_potassium': _potassium, 'potassium': potassium,
    '_resp': _resp, 'resp': resp,
    '_sodium': _sodium, 'sodium': sodium,
    '_spo2': _spo2, 'spo2': spo2,
    '_temp': _temp, 'temp': temp,
    '_urea': _urea, 'urea': urea,
    '_urine': _urine, 'urine': urine,
    '_wbc': _wbc, 'wbc': wbc,
}
np.save('data/raw_features.npy',tosave)
print("Saved!")

Saved!
