# Create data CSVs for analysis

This notebook creates various CSVs of design matrices which we can later use for analysis. The CSVs created are:

* X_design_matrix_first24.csv - first 24 hours of a patient's ICU stay
* X_design_matrix_randomtime.csv - a random time during the patient's ICU stay

In [None]:
from __future__ import print_function 

# Import libraries
import numpy as np
import pandas as pd
import psycopg2
import sys
import datetime as dt
import mp_utils as mp

# Data-source switches read by the loading cell.
#   USE_SQL - truthy: read from the relational database (checked first)
#   USE_CSV - truthy: read CSV files from the local directory
USE_SQL = 1
USE_CSV = 0

In [None]:
# Connection settings for the local postgres copy of mimic (values used on pc70).
sqluser, dbname, schema_name = 'alistairewj', 'mimic', 'mimiciii'

# Statement prepended to every query so that unqualified table names are
# looked up in `public` first and then in the schema named above.
query_schema = 'SET search_path to public,{};'.format(schema_name)

Load the data in using either a relational database (`USE_SQL=1`) or CSV files in the local directory (`USE_CSV=1`).

In [None]:
# Load three frames, from the database or from local CSVs:
#   co_all    - one row per ICU stay, including the `excluded` flag
#   df_static - static (per-stay) variables
#   df        - the time-varying data
# `co` (the non-excluded subset of co_all) is derived afterwards.
if USE_SQL:
    # Connect to local postgres version of mimic
    con = psycopg2.connect(dbname=dbname, user=sqluser)

    # exclusion criteria:
    #   - less than 16 years old
    #   - stayed in the ICU less than 4 hours
    #   - never have any chartevents data (i.e. likely administrative error)
    query = query_schema + 'select * from mp_cohort'
    co_all = pd.read_sql_query(query, con)

    # extract static vars into a separate dataframe
    # static vars contain the following:
    #  - demographics (gender, age)
    #  - hospital admission type
    #  - hospital service
    #  - race
    #  - height, weight, BMI
    #  - outcome
    #  - time of first code status change
    query = query_schema + 'select * from mp_static_data'
    df_static = pd.read_sql_query(query, con)

    # get ~5 million rows containing data from every patient
    # this takes a little bit of time to load into memory (~2 minutes)

    # %%time results
    # CPU times: user 42.8 s, sys: 1min 3s, total: 1min 46s
    # Wall time: 2min 7s
    df = pd.read_sql_query(query_schema + 'select * from mp_data', con)

    # keep icustay_id as the only identifier and order rows by stay, then hour
    df = df.drop(['subject_id', 'hadm_id'], axis=1)
    df = df.sort_values(['icustay_id', 'hr'], axis=0, ascending=True)
    print('Data loaded from database.')
elif USE_CSV:
    co_all = pd.read_csv('df_cohort.csv')
    df = pd.read_csv('df_data.csv')
    df_static = pd.read_csv('df_static_data.csv')
    print('Data loaded from CSV.')
else:
    # Fail here with a clear message; otherwise the indexing below would die
    # with an unrelated NameError because nothing was loaded.
    raise RuntimeError('No data loaded in! Set USE_SQL=1 or USE_CSV=1.')

# Index the per-stay frames by icustay_id.
co_all = co_all.set_index('icustay_id')
df_static = df_static.set_index('icustay_id')

# Final cohort: stays that were not excluded. Take an explicit copy so that
# `co` is an independent frame rather than a slice of `co_all` (modifying a
# slice triggers pandas' SettingWithCopy warning).
co = co_all.loc[co_all['excluded'] == 0, :].copy()

In [None]:
# Summarise the exclusions: one line per `exclusion_*` column of co_all
# (count and percentage of stays flagged), followed by the number of stays
# flagged by at least one criterion and the number flagged by none.
exclusion_cols = [col for col in co_all.columns if 'exclusion_' in col]

# idxRem[i] becomes True once any criterion flags row i of co_all
idxRem = np.zeros(co_all.shape[0], dtype=bool)
for col in exclusion_cols:
    flags = co_all[col]
    print('{:5g} - {:2.2f}% - {}'.format(flags.sum(), flags.mean()*100.0, col))
    idxRem |= (flags.values == 1)

for mask, label in ((idxRem, 'total removed'), (~idxRem, 'final cohort')):
    print('{:5g} - {:2.2f}% - {}'.format(np.sum(mask), np.mean(mask)*100.0, label))

In [None]:
def _indexed_by_icustay(frame):
    """Return `frame` indexed by icustay_id, whether or not it already is."""
    if frame.index.name == 'icustay_id':
        return frame
    return frame.set_index('icustay_id')


def make_design_matrix(W=8, W_extra=24, K=5, seed=111, censor=True):
    """
    Create a design matrix indexed by icustay_id.

    Reads the notebook-level frames `df`, `df_static`, `df_death` and
    `df_censor`, and the helper module `mp`.

    NOTE(review): `df_death` and `df_censor` are not created by the cells
    visible in this notebook - confirm where they are loaded before running.

    Parameters
    ----------
    W, W_extra : passed through to mp.get_design_matrix.
    K : number of cross-validation folds.
    seed : seeds NumPy's global RNG before the fold assignment is drawn.
    censor : passed through to mp.generate_times; when true, stays whose
        `censortime_hours` is <= 0 are also removed from the matrix.

    Returns
    -------
    DataFrame with, in order: the columns produced by mp.get_design_matrix,
    the static variables named by mp.vars_of_interest(), `death` (outcome),
    `subject_id`, and `idxK` (fold index in 0..K-1). All stays belonging to
    the same subject_id share one fold.

    Raises
    ------
    ValueError if a stay's subject_id does not appear in df_static, because
    no fold could be assigned to it.
    """

    # generate k-fold indices
    np.random.seed(seed)
    # unique subject_id (np.unique returns them sorted, which searchsorted needs)
    sid = np.unique(df_static['subject_id'].values)

    # assign k-fold: a random permutation taken modulo K gives near-equal folds
    idxK_sid = np.mod(np.random.permutation(sid.shape[0]), K)

    # only the static variable names are needed here (last item returned)
    var_static = mp.vars_of_interest()[-1]

    # create window time for each patient
    df_tmp = df_death.merge(df_censor, how='left', on='icustay_id')
    # NOTE(review): T and seed are fixed here and do not follow this
    # function's `seed` argument - confirm that is intended.
    time_dict = mp.generate_times(df_tmp, T=2, seed=111, censor=censor)

    # generate windows
    df_data = mp.get_design_matrix(df, time_dict, W=W, W_extra=W_extra)

    # remove icustay_ids if they were censored (made DNR) before icu admission, or close enough to that
    if censor:
        idx = df_censor.loc[df_censor['censortime_hours']<=0, 'icustay_id']
        # only stays actually present can be dropped; asking drop() for a
        # missing label raises KeyError
        idx = idx[idx.isin(df_data.index)]
        print('Removed {} icustay_id as they were censored on/before ICU admission.'.format((idx.shape[0])))
        df_data = df_data.drop(idx, axis=0)

    # first, the data from static vars from df_static
    # (df_static may already be indexed by icustay_id by the loading cell)
    X = df_data.merge(_indexed_by_icustay(df_static)[var_static],
                      how='left', left_index=True, right_index=True)

    # next, add in the outcome (death in hospital) and the subject_id used to
    # group stays into folds
    # NOTE(review): `death` is assumed to be a column of df_death, the same
    # frame that supplies subject_id - confirm against the loader.
    X = X.merge(_indexed_by_icustay(df_death)[['death', 'subject_id']],
                left_index=True, right_index=True)

    # searchsorted returns an insertion point, not a match, so an unknown
    # subject_id would silently receive some other subject's fold
    unknown = ~X['subject_id'].isin(sid)
    if unknown.any():
        raise ValueError('{} icustay_id have a subject_id missing from df_static; '
                         'cannot assign k-fold index.'.format(unknown.sum()))

    # get indices which map subject_ids in sid to the X dataframe
    idxMap = np.searchsorted(sid, X['subject_id'].values)

    # use these indices to map the k-fold integers and add them to the design matrix
    X['idxK'] = idxK_sid[idxMap]

    return X

In [None]:
# Build the design matrix with the settings below and write it to disk
# (the index is written as the first CSV column).
X = make_design_matrix(
    W=8,
    W_extra=24,
    K=5,
    seed=111,
    censor=True,
)
X.to_csv('X_design_matrix_first24.csv')

In [None]:
# Build the design matrix with the settings below and write it to disk
# (the index is written as the first CSV column).
# NOTE(review): these arguments are identical to the ones used when writing
# X_design_matrix_first24.csv, and the seed is fixed, so this file appears to
# come out the same as that one - confirm how the random-time variant is
# meant to be produced.
X = make_design_matrix(
    W=8,
    W_extra=24,
    K=5,
    seed=111,
    censor=True,
)
X.to_csv('X_design_matrix_randomtime.csv')