In [1]:
# Import libraries
import numpy as np
import pandas as pd
import psycopg2
import getpass
import time
import os.path
import matplotlib.pyplot as plt
%matplotlib inline
# Move to the project root so the relative 'data/' and 'queries/' paths used below resolve.
# The location can be overridden with the VENTILATION_PROJECT_DIR environment variable;
# the default is the original machine-specific path, so existing runs are unaffected.
PROJECT_DIR = os.environ.get(
    'VENTILATION_PROJECT_DIR',
    'C:\\Users\\anear\\OneDrive - National University of Ireland, Galway\\PhD\\Research Projects\\Ventilation Project'
)
os.chdir(PROJECT_DIR)

In [2]:
# Read the stored connection settings.
# NOTE(review): the .npy file is unpickled (allow_pickle=True) - only load files you trust.
_data = np.load('data/db_details.npy', allow_pickle=True).tolist()
db_details = _data['db_details']
user, host, port, dbname, schema = (
    db_details[key] for key in ('user', 'host', 'port', 'dbname', 'schema')
)

# Open the connection; the password is prompted for interactively and never stored in a variable
conn = psycopg2.connect(
    dbname=dbname,
    user=user,
    host=host,
    port=port,
    password=getpass.getpass(prompt='Password:'),
)

# Point the session at the configured schema so queries can use unqualified table names
cur = conn.cursor()
cur.execute('SET search_path TO {}'.format(schema))
conn.commit()

Password: ··········


In [3]:
# Load the patient data
# The file is read with allow_pickle=True and .tolist() unwraps the stored object,
# which is then indexed like a dict.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
_data = np.load('data/final_patients.npy', allow_pickle=True).tolist()
# `patients` is later indexed by 'hadm_id', 'admittime' and 'dischtime' (see collect_all_measurements)
patients = _data['patients']
print("Loaded the patient data!")

Loaded the patient data!


In [4]:
# Function to query the database for a specific feature
def get_feature(feature_name):
    
    '''
    Queries the MIMIC-III database for the specified feature.
    feature_name is the base name (without the '.sql' extension) of a file in the 'queries' directory,
    relative to the current working directory. The SQL in that file is run on the module-level connection `conn`.
    Returns the query result as a pandas DataFrame and prints how long the call took.
    '''
    
    start = time.time()

    # Load the query text; the context manager closes the file even if reading fails
    with open('queries/{}.sql'.format(feature_name), 'r') as query_file:
        query = query_file.read()

    # Store the result of the query as a pandas dataframe
    result = pd.read_sql_query(query, conn)

    end = time.time()
    
    # Print run time
    print("The '{}' query took {:.2f} seconds".format(feature_name, end-start))
    
    # Return the feature
    return result

In [5]:
# Function that removes missing values
def remove_nan(feature):
    
    '''
    Removes rows with a missing measurement value from the feature set.
    feature is a pandas DataFrame representing some feature queried from the MIMIC-III database.
    Rows are dropped where 'valuenum' is missing; if there is no 'valuenum' column, 'amount' is used instead.
    Returns a new DataFrame (the input is not modified) and prints how many rows were removed.
    Raises KeyError if the DataFrame has neither a 'valuenum' nor an 'amount' column.
    '''
    
    # Pick the measurement column; 'valuenum' takes precedence over 'amount'.
    # If neither exists, dropna raises KeyError for 'amount'.
    if 'valuenum' in feature.columns:
        value_column = 'valuenum'
    else:
        value_column = 'amount'
    
    # Remove rows where the measurement is not a number
    result = feature.dropna(axis=0, subset=[value_column])
    
    # Calculate how many rows were removed
    before, after = len(feature), len(result)
    diff = before - after
    
    # An empty input has nothing to remove; avoid dividing by zero when reporting the percentage
    percentage = 100*diff/before if before else 0.0
    
    # Print how many rows were removed
    print("{} rows were removed ({:.2f}% of all measurements)".format(diff, percentage))
    
    # Return the cleaned feature dataframe
    return result

In [6]:
# Function that creates a list of numpy arrays with all measurements for each patient sorted by time
def collect_all_measurements(feature, patients):
    '''
    Creates a list of numpy arrays, one per row of patients, where each array has the times measurements
    were taken (first column) and the measurement values (second column), sorted by time.
    feature is a pandas DataFrame representing some feature queried from the MIMIC-III database. It must have
    a 'hadm_id' column and one of these column pairs (checked in this order): 'starttime' + 'amount',
    'charttime' + 'amount', 'charttime' + 'valuenum'.
    patients is a pandas DataFrame that has been previously loaded, containing 'hadm_id', 'admittime' and
    'dischtime' for the patients of interest.
    Only measurements strictly after admission and strictly before discharge are kept.
    Returns a list with one array of shape (number of measurements, 2) per patient row, in the order of patients.
    Raises ValueError if feature has none of the supported column pairs.
    '''
    
    start = time.time()
    
    # Work out once which columns go into the output (this does not change from patient to patient)
    columns = feature.columns
    if 'amount' in columns and 'starttime' in columns:
        time_col, value_col = 'starttime', 'amount'
    elif 'amount' in columns and 'charttime' in columns:
        time_col, value_col = 'charttime', 'amount'
    elif 'valuenum' in columns and 'charttime' in columns:
        time_col, value_col = 'charttime', 'valuenum'
    else:
        # Previously this case surfaced as an unbound local variable inside the loop
        raise ValueError(
            "feature must have ('starttime' or 'charttime') with 'amount', or 'charttime' with 'valuenum'; "
            "got columns {}".format(list(columns))
        )
    
    # Column used to restrict measurements to the hospital stay and to sort them.
    # NOTE(review): when feature has both 'charttime' and 'starttime' together with 'amount', rows are
    # filtered and sorted by 'charttime' but 'starttime' is what gets returned - confirm this is intended.
    filter_col = 'charttime' if 'charttime' in columns else 'starttime'
    
    # Extract hospital admission ID, hospital admission time and hospital discharge time
    hadm_ids = np.array(patients['hadm_id'])
    admittimes = np.array(patients['admittime'])
    dischtimes = np.array(patients['dischtime'])
    
    # Create an empty list in which to store the data
    results = []
    
    # Iterate over every patient (one hospital admission per row)
    for h_id, admit, disch in zip(hadm_ids, admittimes, dischtimes):
        
        # Select data for this hospital admission
        measurements = feature[feature['hadm_id'] == int(h_id)]
        
        # Discard measurements at or before admission and at or after discharge, then sort by time
        times = measurements[filter_col]
        measurements = measurements[(times > admit) & (times < disch)]
        measurements = measurements.sort_values(by=[filter_col])
        
        # Create a final array for this admission, with one column for datetimes and one column for measurements
        stamps = np.array([x.to_pydatetime() for x in measurements[time_col]])
        values = np.array(measurements[value_col])
        res = np.vstack((stamps, values)).T
        
        # Append this admission's array to the results list
        results.append(res)
        
    end = time.time()
    
    # Print run time
    print("It took {} minutes to collect all the measurements for each patient".format(round((end-start)/60,2)))
        
    # Return the results
    return results

In [7]:
# Specify the features (each name is passed to get_feature, which reads 'queries/<name>.sql')
features = [
    'fentanyl_cv',
    'fentanyl_mv',
    'ie_ratio',
    'insp_flow',
    'insp_press',
    'peep',
    'propofol_cv',
    'propofol_mv',
    'psv',
    'resp',
    'tidvol_set',
    'tidvol_obs',
    'tidvol_spon'
]

# Iterate over every feature
for feature_name in features:
    
    # Query the database for the feature and remove any missing values
    raw_feature = remove_nan(get_feature(feature_name))
    
    # Collect measurements for every patient, excluding measurements outside the relevant hospital stay
    measurements = collect_all_measurements(raw_feature, patients)
    
    # Publish the results under the global names '_<feature>' (cleaned query result) and
    # '<feature>' (per-patient arrays). Assigning through globals() creates the same variables
    # the save cell reads, without building and exec-ing source code strings.
    globals()['_' + feature_name] = raw_feature
    globals()[feature_name] = measurements
    
    # Print a separator between features
    print('')

The 'fentanyl_cv' query took 3.13 seconds
444496 rows were removed (56.95% of all measurements)
It took 0.41 minutes to collect all the measurements for each patient

The 'fentanyl_mv' query took 0.60 seconds
0 rows were removed (0.00% of all measurements)
It took 0.39 minutes to collect all the measurements for each patient

The 'ie_ratio' query took 1.01 seconds
0 rows were removed (0.00% of all measurements)
It took 0.45 minutes to collect all the measurements for each patient

The 'insp_flow' query took 7.70 seconds
4326 rows were removed (0.74% of all measurements)
It took 0.53 minutes to collect all the measurements for each patient

The 'insp_press' query took 5.79 seconds
5897 rows were removed (0.85% of all measurements)
It took 0.6 minutes to collect all the measurements for each patient

The 'peep' query took 7.89 seconds
4764 rows were removed (0.52% of all measurements)
It took 0.67 minutes to collect all the measurements for each patient

The 'propofol_cv' query took 5.33

In [8]:
# Save to data/ventilator_features.npy
os.makedirs('./data', exist_ok=True)

# Build the payload from the `features` list so it cannot drift out of sync with the extraction loop.
# For every feature, '_<feature>' is the cleaned query result and '<feature>' is the list of
# per-patient arrays; both are looked up in the notebook's global namespace.
tosave = {}
for feature_name in features:
    tosave['_' + feature_name] = globals()['_' + feature_name]
    tosave[feature_name] = globals()[feature_name]

# np.save pickles the dict, so it has to be read back with np.load(..., allow_pickle=True)
np.save('data/ventilator_features.npy',tosave)
print("Saved!")

Saved!
