In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the pickled patient/admission/ICU frame (presumably written by an earlier
# preprocessing notebook — verify). It is inner-joined with the note events
# further down on SUBJECT_ID / HADM_ID and supplies INTIME for the time deltas.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
df = pd.read_pickle('./data/pickle/preproc/df_patient_admit_icu__20210204_singleICUSTAY_final.pkl')

In [3]:
# Location of the MIMIC-III v1.4 files and the name of the note-events CSV.
# The two strings are joined with `+` when the file is read, so `data_dir`
# must keep its trailing slash.
data_dir = 'data/physionet.org/files/mimiciii/1.4/'
notes_file = 'NOTEEVENTS.csv'

In [4]:
# Not referenced by any code visible in this notebook section.
skiplist = []

# Upper bound used by the chunked read loop (presumably the number of lines in
# NOTEEVENTS.csv including the header — verify against the file).
row_count = 2083181
# Number of leading file rows to skip before the first chunk.
skiprows = 0
nrows = 100000  # default chunk size (rows read per iteration)

# Column names applied to the CSV in place of its own header row.
colnames = np.array(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE',
           'CHARTTIME', 'STORETIME', 'CATEGORY', 'DESCRIPTION',
           'CGID', 'ISERROR','TEXT'
          ])

# Keep only SUBJECT_ID, HADM_ID, CHARTTIME, ISERROR and TEXT.
usecols = colnames[[1,2,4,9,10]]
# Iteration counter printed by the read loop.
cnt=0

In [5]:
from customTransformers import ColumnSelectTransformer, DiagnosisFrameTransformer

def preprocess_text(df):
    """Clean the ``TEXT`` column of a notes frame.

    Missing values are replaced by a single space, newlines (``'\\n'``) and
    carriage returns (``'\\r'``) are replaced by spaces, and the result is then
    passed through ``ColumnSelectTransformer(['TEXT'])`` followed by
    ``DiagnosisFrameTransformer`` (project transformers; their exact cleaning
    rules are defined in ``customTransformers``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``TEXT`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``TEXT`` replaced by the cleaned text.
    """
    text = df['TEXT'].fillna(' ')
    # regex=False: these are literal characters. Being explicit avoids relying
    # on the default of `regex`, which changed between pandas 1.x and 2.0.
    text = text.str.replace('\n', ' ', regex=False)
    text = text.str.replace('\r', ' ', regex=False)
    # Bracket assignment (rather than `df.TEXT = ...`) always targets the column.
    df['TEXT'] = text

    cst = ColumnSelectTransformer(['TEXT'])
    dft = DiagnosisFrameTransformer()

    selected = cst.fit_transform(df)
    df['TEXT'] = dft.fit_transform(selected)

    return df

In [6]:
# Column order of the combined notes frame built by this cell.
all_colnames = ['CHARTTIME', 'TEXT', 'SUBJECT_ID', 'GENDER', 'HADM_ID', 'ADMITTIME',
       'DISCHTIME', 'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'INSURANCE',
       'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY', 'DIAGNOSIS',
       'HOSPITAL_EXPIRE_FLAG', 'HAS_CHARTEVENTS_DATA', 'HOSPITAL_DAYS',
       'ADMIT_AGE', 'ICUSTAY_ID', 'DBSOURCE', 'INTIME', 'LOS',
       'DAYS_ADM_TO_ICU', 'SAMEDAY_ADM_TO_ICU', 'ADM_TO_ICU_100p',
       'ADM_TO_ICU_90m', 'ICU_URGENCY', 'DAYS_NOTE_TO_ICU']

# NOTE: `cst` and `dft` are not used in this cell (preprocess_text builds its
# own instances); they are kept only so the names remain defined.
cst = ColumnSelectTransformer(['TEXT'])
dft = DiagnosisFrameTransformer()

# Stream the notes file in ONE pass, `nrows` rows per chunk, instead of
# re-opening it and skipping an ever larger prefix on every iteration.
# `skiprows` / `row_count` still bound the region that is read, but none of the
# configuration variables are mutated, so this cell can safely be re-run.
note_reader = pd.read_csv(data_dir + notes_file, sep=',', header=0, names=colnames,
                          usecols=usecols, skiprows=skiprows,
                          nrows=max(row_count - skiprows, 0), chunksize=nrows)

# Cleaned chunks are collected here and concatenated once at the end;
# DataFrame.append was removed in pandas 2.0 and re-copied all rows every time.
note_chunks = []
for chunk_idx, df_note in enumerate(note_reader, start=cnt):
    print('Iteration {}...'.format(chunk_idx))

    # drop notes without a chart time
    df_note = df_note.dropna(subset=['CHARTTIME'])

    # drop notes flagged as errors (ISERROR == 1), then the flag itself
    df_note = df_note[df_note['ISERROR']!=1]
    df_note = df_note.drop(columns=['ISERROR'])

    # keep only notes whose ['SUBJECT_ID', 'HADM_ID'] pair also exists in df
    df_note = df_note.merge(df, on=['SUBJECT_ID', 'HADM_ID'], how='inner')
    df_note = df_note.drop_duplicates()

    # convert charttime to datetime (unparseable values become NaT)
    df_note['CHARTTIME'] = pd.to_datetime(df_note['CHARTTIME'],
                                          format='%Y-%m-%d %H:%M:%S', errors='coerce')

    # days from the note event to ICU admission
    df_note['DAYS_NOTE_TO_ICU'] = (df_note['INTIME'] - df_note['CHARTTIME']).dt.total_seconds()/(24*60*60)

    # keep only notes charted before ICU INTIME (> 0); rows with NaT fail the
    # comparison and are dropped too. .copy() gives preprocess_text an
    # independent frame to modify instead of a slice of the merged chunk.
    df_note = df_note[df_note['DAYS_NOTE_TO_ICU']>0].copy()

    if not df_note.empty:
        # clean up TEXT (similar to DIAGNOSES)
        note_chunks.append(preprocess_text(df_note))

if note_chunks:
    df_all = pd.concat(note_chunks)
    # Same layout the incremental append produced: all_colnames first (missing
    # ones filled with NaN), then any additional columns.
    extra_cols = [col for col in df_all.columns if col not in all_colnames]
    df_all = df_all.reindex(columns=all_colnames + extra_cols)
else:
    df_all = pd.DataFrame(columns=all_colnames)

Iteration 0...
Iteration 1...
Iteration 2...
Iteration 3...
Iteration 4...
Iteration 5...
Iteration 6...
Iteration 7...
Iteration 8...
Iteration 9...
Iteration 10...
Iteration 11...
Iteration 12...
Iteration 13...
Iteration 14...
Iteration 15...
Iteration 16...
Iteration 17...
Iteration 18...
Iteration 19...
Iteration 20...


In [7]:
from datetime import timedelta
def get_earliest_time_row(x):
    """Return a copy of the earliest-charted row of ``x`` with ADMITTIME moved back 1 s.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of one group; needs ``CHARTTIME`` and ``ADMITTIME`` columns and an
        index whose labels are unique within the group.

    Returns
    -------
    pandas.Series
        The row with the smallest ``CHARTTIME``, with ``ADMITTIME`` reduced by
        one second. ``x`` itself is left untouched.
    """
    # .copy(): the row is modified below; without it the assignment can raise
    # SettingWithCopyWarning or write through to the caller's frame.
    minrow = x.loc[x['CHARTTIME'].idxmin()].copy()
    # this will force empty TEXTs to be at top of groupby (for @ initial admit)
    # without affecting times
    minrow['ADMITTIME'] = minrow['ADMITTIME'] - timedelta(seconds=1)
    return minrow

In [8]:
def add_initial_admit_row(df):
    """Add one blank-TEXT "initial admit" row per (SUBJECT_ID, HADM_ID) group.

    For every group the row returned by ``get_earliest_time_row`` is taken, its
    ``TEXT`` is set to a single space, and it is added to the frame. The result
    is sorted by SUBJECT_ID, HADM_ID, ADMITTIME and CHARTTIME and re-indexed
    from 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Notes frame containing the four sort columns and ``TEXT``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra rows; each call adds one row per group.
    """
    df_gb_early = df.groupby(['SUBJECT_ID', 'HADM_ID']).apply(get_earliest_time_row)
    df_gb_early['TEXT'] = ' '
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    df = pd.concat([df, df_gb_early])
    df = df.sort_values(by=['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'CHARTTIME'])
    df = df.reset_index(drop=True)
    return df

In [9]:
# 2. Sort the notes by SUBJECT_ID, HADM_ID, ADMITTIME and then CHARTTIME, and
#    renumber the rows from 0.
df_all = (
    df_all
    .sort_values(by=['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'CHARTTIME'])
    .reset_index(drop=True)
)

# 3. For each (SUBJECT_ID, HADM_ID), add a row with blank TEXT that represents
#    the information available immediately after admission.
#    NOTE: re-running this cell adds another blank row per admission.
df_all = add_initial_admit_row(df_all)

In [25]:
# 4. For every row after the first of each SUBJECT_ID (across all HADM_IDs),
#    prepend the text of all preceding rows, so that TEXT becomes a running
#    tally of the subject's notes across hospital admits.
def extend_text_set(x):
    """Replace ``TEXT`` with the running concatenation of ``TEXT`` down the rows.

    Row ``i`` of the result holds ``TEXT[0] + TEXT[1] + ... + TEXT[i]`` in the
    row order of ``x``.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of one group, already in the desired order; ``TEXT`` must hold
        strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``x``; ``x`` is not
        modified.
    """
    from itertools import accumulate

    # The previous `x['TEXT'].iloc[ix] = ...` was a chained assignment, which
    # pandas does not guarantee writes back to `x` (it is a no-op under
    # copy-on-write). Build the column in one go instead.
    return x.assign(TEXT=list(accumulate(x['TEXT'])))

# group_keys=False keeps the original row index on the result. From pandas 2.0
# the default would otherwise add SUBJECT_ID as an index level, which clashes
# with the SUBJECT_ID column in later operations.
df_all = df_all.groupby(['SUBJECT_ID'], group_keys=False).apply(extend_text_set)

In [28]:
# 5. create new DAYS_EVENT_TO_ICU that collapses DAYS_ADM_TO_ICU (for admit info only) and DAYS_NOTE_TO_ICU (for all notes)
def fill_event_to_icu_col(x):
    """Pick the days-to-ICU value that applies to one row.

    Returns ``x['DAYS_ADM_TO_ICU']`` when ``x['TEXT']`` is exactly a single
    space, otherwise ``x['DAYS_NOTE_TO_ICU']``.

    NOTE(review): the blank-TEXT test only recognises rows whose TEXT is still
    exactly ' '. If TEXT has already been cumulatively concatenated per
    subject, blank rows of later admissions would no longer match — confirm
    this is intended.
    """
    is_blank_row = x['TEXT'] == ' '
    source_col = 'DAYS_ADM_TO_ICU' if is_blank_row else 'DAYS_NOTE_TO_ICU'
    return x[source_col]

# Evaluate fill_event_to_icu_col once per row and store the result as a new column.
df_all['DAYS_EVENT_TO_ICU'] = df_all.apply(fill_event_to_icu_col, axis='columns')

In [30]:
# Inspect the available columns before trimming and reordering them.
df_all.columns

Index(['CHARTTIME', 'TEXT', 'SUBJECT_ID', 'GENDER', 'HADM_ID', 'ADMITTIME',
       'DISCHTIME', 'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'INSURANCE',
       'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY', 'DIAGNOSIS',
       'HOSPITAL_EXPIRE_FLAG', 'HAS_CHARTEVENTS_DATA', 'HOSPITAL_DAYS',
       'ADMIT_AGE', 'ICUSTAY_ID', 'DBSOURCE', 'INTIME', 'LOS',
       'DAYS_ADM_TO_ICU', 'SAMEDAY_ADM_TO_ICU', 'ADM_TO_ICU_100p',
       'ADM_TO_ICU_90m', 'ICU_URGENCY', 'DAYS_NOTE_TO_ICU',
       'DAYS_EVENT_TO_ICU'],
      dtype='object')

In [31]:
# Remove the columns that are not carried into the final frame, including the
# two source columns already collapsed into DAYS_EVENT_TO_ICU.
df_all = df_all.drop(
    columns=[
        'DISCHTIME',
        'HOSPITAL_EXPIRE_FLAG',
        'HAS_CHARTEVENTS_DATA',
        'HOSPITAL_DAYS',
        'DAYS_ADM_TO_ICU',
        'SAMEDAY_ADM_TO_ICU',
        'ADM_TO_ICU_100p',
        'ADM_TO_ICU_90m',
        'DAYS_NOTE_TO_ICU',
    ]
)

In [32]:
# Put the columns in their final order: identifiers, timestamps, patient and
# admission attributes, text fields, then ICU fields and the outcome columns.
df_all = df_all.loc[:, [
    'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID',
    'ADMITTIME', 'CHARTTIME', 'INTIME',
    'GENDER', 'ADMIT_AGE',
    'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'INSURANCE',
    'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY',
    'DIAGNOSIS', 'TEXT',
    'DBSOURCE', 'LOS', 'DAYS_EVENT_TO_ICU',
    'ICU_URGENCY',
]]

In [33]:
# Preview the first rows of the reordered frame.
df_all.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ADMITTIME,CHARTTIME,INTIME,GENDER,ADMIT_AGE,ADMISSION_TYPE,ADMISSION_LOCATION,...,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,DIAGNOSIS,TEXT,DBSOURCE,LOS,DAYS_EVENT_TO_ICU,ICU_URGENCY
0,3,145834.0,211552,2101-10-20 19:07:59,2101-10-20 17:49:00,2101-10-20 19:10:11,M,76,EMERGENCY,EMERGENCY ROOM ADMIT,...,***,CATHOLIC,MARRIED,WHITE,HYPOTENSION,,carevue,6.0646,0.001516,immediate
1,3,145834.0,211552,2101-10-20 19:08:00,2101-10-20 17:49:00,2101-10-20 19:10:11,M,76,EMERGENCY,EMERGENCY ROOM ADMIT,...,***,CATHOLIC,MARRIED,WHITE,HYPOTENSION,ct abdomen contrast ct pelvis contrast clip ...,carevue,6.0646,0.056377,immediate
2,3,145834.0,211552,2101-10-20 19:08:00,2101-10-20 18:16:00,2101-10-20 19:10:11,M,76,EMERGENCY,EMERGENCY ROOM ADMIT,...,***,CATHOLIC,MARRIED,WHITE,HYPOTENSION,ct abdomen contrast ct pelvis contrast clip ...,carevue,6.0646,0.037627,immediate
3,4,185777.0,294638,2191-03-16 00:27:59,2191-03-15 16:20:00,2191-03-16 00:29:31,F,48,EMERGENCY,EMERGENCY ROOM ADMIT,...,***,PROTESTANT QUAKER,SINGLE,WHITE,"FEVER,DEHYDRATION,FAILURE TO THRIVE",,carevue,1.6785,0.001053,immediate
4,4,185777.0,294638,2191-03-16 00:28:00,2191-03-15 16:20:00,2191-03-16 00:29:31,F,48,EMERGENCY,EMERGENCY ROOM ADMIT,...,***,PROTESTANT QUAKER,SINGLE,WHITE,"FEVER,DEHYDRATION,FAILURE TO THRIVE",chest portable ap clip clip number radiology...,carevue,1.6785,0.339942,immediate


In [34]:
# Binary outcome for the time from hospital event to ICU stay:
# 1 when the event happened at most one day (<= 24 h) before the ICU stay, else 0.
df_all['SAMEDAY_EVENT_TO_ICU'] = df_all['DAYS_EVENT_TO_ICU'].le(1).astype('int64')

In [35]:
import numpy as np

# create 4-category urgency bins
def get_time_bin_cats(val,edges=np.array([1/24, 1, 5, 300]),labels=['immediate','urgent','questionable','stable']):
    """Return the label of the first bin whose upper edge is >= ``val``.

    Parameters
    ----------
    val : float
        Value to classify, in the same unit as ``edges`` (the defaults 1/24, 1,
        5 and 300 read as days: one hour, one day, five days, 300 days).
    edges : array-like of float
        Inclusive upper bin edges, in the order they are tested.
    labels : sequence
        Label for each edge; ``labels[i]`` belongs to ``edges[i]``.

    Returns
    -------
    The entry of ``labels`` for the first ``i`` with ``val <= edges[i]``.

    Raises
    ------
    IndexError
        If ``val`` is NaN or greater than every edge. The same exception type
        was raised before, but with an uninformative numpy message.
    """
    # asarray lets callers pass a plain list of edges as well as an ndarray.
    hits = np.flatnonzero(val <= np.asarray(edges))
    if hits.size == 0:
        raise IndexError(
            'value {!r} is not <= any bin edge in {}'.format(val, list(np.asarray(edges)))
        )

    return labels[hits[0]]

# Recompute ICU_URGENCY from DAYS_EVENT_TO_ICU, one urgency label per row.
df_all['ICU_URGENCY'] = df_all['DAYS_EVENT_TO_ICU'].map(get_time_bin_cats)

In [39]:
# Sanity check: (rows, columns) of the final frame.
df_all.shape

(108178, 22)

In [41]:
from sklearn.impute import SimpleImputer
# Replace missing LOS values with the median of the LOS column.
# NOTE(review): the median is fitted on the whole frame, i.e. before the
# train/test split, so held-out rows contribute to the imputed value — consider
# fitting the imputer on the training rows only.
simp = SimpleImputer(strategy='median')
df_all[['LOS']] = simp.fit_transform(df_all[['LOS']])

In [42]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, stratified on ICU_URGENCY, with a fixed seed.
# NOTE(review): X is the full frame, so it still contains the ICU_URGENCY target
# column (and SAMEDAY_EVENT_TO_ICU / DAYS_EVENT_TO_ICU). The split is also
# row-wise, so rows of the same SUBJECT_ID can land in both train and test —
# confirm downstream code drops these columns and that this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    df_all,
    df_all['ICU_URGENCY'],
    test_size=0.2,
    random_state=4,
    stratify=df_all['ICU_URGENCY'],
)

In [43]:
# Persist the complete (unsplit) notes frame.
df_all.to_pickle('./data/pickle/preproc/df_patient_admit_icu_notes__20210206_singleICUSTAY_final.pkl')

In [44]:
import pickle

# Save the train and test splits as (X, y) tuples. The `with` blocks guarantee
# each file is flushed and closed; the handles were previously left open.
file = './data/pickle/preproc/df_patient_admit_icu_notes__20210206_singleICUSTAY_TRAIN_final.pkl'
with open(file, 'wb') as fh:
    pickle.dump((X_train, y_train), fh)

file = './data/pickle/preproc/df_patient_admit_icu_notes__20210206_singleICUSTAY_TEST_final.pkl'
with open(file, 'wb') as fh:
    pickle.dump((X_test, y_test), fh)