In [1]:
print(__doc__)

# Common imports
import numpy as np
import numpy.random as rnd
import os
import pandas as pd

# File specific imports
import datetime
from dateutil.relativedelta import relativedelta

# To make this notebook's output stable across runs
rnd.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10

Automatically created module for IPython interactive environment


In [2]:
csv_path = '/media/bigdatabuffalo/drive/MIMIC3/CSV/ADMISSIONS.csv'
cols_to_keep = ['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'ADMISSION_LOCATION', 'ETHNICITY', 'DIAGNOSIS']
admissions = pd.read_csv(csv_path, skipinitialspace = True, usecols = cols_to_keep)

csv_path = '/media/bigdatabuffalo/drive/MIMIC3/CSV/PATIENTS.csv'
cols_to_keep = ['SUBJECT_ID', 'GENDER', 'DOB', 'EXPIRE_FLAG']
patients = pd.read_csv(csv_path, skipinitialspace = True, usecols = cols_to_keep)

csv_path = '/media/bigdatabuffalo/drive/MIMIC3/CSV/INPUTEVENTS_CV.csv'
cols_to_keep = ['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'CHARTTIME', 'ITEMID', 'AMOUNT', 'AMOUNTUOM'] # 'RATE', 'RATEUOM', 'STOPPED' are full of NaN
inputevents_cv = pd.read_csv(csv_path, skipinitialspace = True, usecols = cols_to_keep)

csv_path = '/media/bigdatabuffalo/drive/MIMIC3/CSV/INPUTEVENTS_MV.csv'
cols_to_keep = ['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'STARTTIME', 'ENDTIME', 'ITEMID', 'AMOUNT', 'AMOUNTUOM', 'RATE', 'RATEUOM', 'ORDERCATEGORYNAME', 'ORDERCATEGORYDESCRIPTION']
inputevents_mv = pd.read_csv(csv_path, skipinitialspace = True, usecols = cols_to_keep)

csv_path = '/media/bigdatabuffalo/drive/MIMIC3/CSV/LABEVENTS.csv'
cols_to_keep = ['SUBJECT_ID', 'HADM_ID', 'ITEMID', 'CHARTTIME', 'VALUE', 'VALUENUM', 'VALUEUOM', 'FLAG'] # 'HADM_ID' is full of NaN
labevents = pd.read_csv(csv_path, skipinitialspace = True, usecols = cols_to_keep)

csv_path = '/media/bigdatabuffalo/drive/MIMIC3/CSV/D_ITEMS.csv'
cols_to_keep = ['ITEMID', 'LABEL'] #'CATEGORY', 'UNITNAME', 'PARAM_TYPE' are full of NaN
d_items = pd.read_csv(csv_path, skipinitialspace = True, usecols = cols_to_keep)

csv_path = '/media/bigdatabuffalo/drive/MIMIC3/CSV/D_LABITEMS.csv'
cols_to_keep = ['ITEMID', 'LABEL', 'FLUID', 'CATEGORY']
d_labitems = pd.read_csv(csv_path, skipinitialspace = True, usecols = cols_to_keep)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
admissions = admissions.applymap(str)
patients = patients.applymap(str)
inputevents_cv = inputevents_cv.applymap(str)
inputevents_mv  = inputevents_mv .applymap(str)
labevents  = labevents.applymap(str) 
d_items = d_items.applymap(str)
d_labitems = d_labitems.applymap(str)

# PATIENT INFORMATION

In [4]:
# Combine admissions and patients
patient_info = pd.merge(admissions, patients, on = ['SUBJECT_ID'])

# Change Ethnicity to White/Non-White
ethnicity = patient_info['ETHNICITY']

def get_ethnicity(value):
    if value == 'WHITE':
        return 'WHITE'
    else:
        return 'NOT WHITE'

list_ethnicity = [get_ethnicity(value) for value in ethnicity]

patient_info = patient_info.drop('ETHNICITY', axis = 1)
df_ethnicity = pd.DataFrame({'ETHNICITY' : list_ethnicity})
patient_info = pd.concat([patient_info, df_ethnicity], axis=1)

# Add ages
DoB = patient_info['DOB']
admDate = patient_info['ADMITTIME']

def get_age (birth, current):
    #replace dashes and colons with spaces to make breaking the string up easier
    birth = birth.replace('-', ' ')
    birth = birth.replace(':', ' ')
    current = current.replace('-', ' ')
    current = current.replace(':', ' ')

    #split into a string outputting [year, month, day, hour, minutes, seconds]
    birth = birth.split(' ')
    current = current.split(' ')

    #convert to integers
    birth = [int(i) for i in birth]
    current = [int(i) for i in current]
    
    #get age and return it
    birth = datetime.datetime(birth[0], birth[1], birth[2], birth[3], birth[4], birth[5])
    current = datetime.datetime(current[0], current[1], current[2], current[3], current[4], current[5])
    age = relativedelta(current, birth).years
    
    #adjust for 89+ category
    if age == 300:
        age = 89
    
    return age

def get_age_category(age):
    if age > 88:
        return ('89+')
    elif age > 60:
        return('Elderly')
    elif age > 50:
        return('50s')
    elif age > 40:
        return('40s')
    elif age > 30:
        return('30s')
    elif age > 17:
        return('20s')
    else:
        return('Child')

ages = []
age_categories = []

for i in range(len(DoB)):
    ages.append(get_age(DoB[i], admDate[i]))

for age in ages:
    age_categories.append(get_age_category(age))

ages_df = pd.DataFrame({'AGE' : ages})
ages_cat_df = pd.DataFrame({'AGE_CAT' : age_categories})
patient_info = pd.concat([patient_info, ages_df], axis=1)
patient_info = pd.concat([patient_info, ages_cat_df], axis=1)

# Add LOS
admDate = patient_info['ADMITTIME']
dischDate = patient_info['DISCHTIME']

def get_los (admit, disch):
    #replace dashes and colons with spaces to make breaking the string up easier
    admit = admit.replace('-', ' ')
    admit = admit.replace(':', ' ')
    disch = disch.replace('-', ' ')
    disch = disch.replace(':', ' ')

    #split into a string outputting [year, month, day, hour, minutes, seconds]
    admit = admit.split(' ')
    disch = disch.split(' ')

    #convert to integers
    admit = [int(i) for i in admit]
    disch = [int(i) for i in disch]
    
    #get los and return it
    admit = datetime.datetime(admit[0], admit[1], admit[2], admit[3], admit[4], admit[5])
    disch = datetime.datetime(disch[0], disch[1], disch[2], disch[3], disch[4], disch[5])
    los = (disch-admit).total_seconds()/3600
    los = round((los/24), 1)

    return los

los = []

for i in range(len(admDate)):
    los.append(get_los(admDate[i], dischDate[i]))
    
los_df = pd.DataFrame({'LOS' : los})
patient_info = pd.concat([patient_info, los_df], axis = 1)

# Get Day/Night (day is between 8 am : 8 pm)
admTime = patient_info['ADMITTIME']

def get_day_night (value):
    #replace dashes and colons with spaces to make breaking the string up easier
    value = value.replace('-', ' ')
    value = value.replace(':', ' ')
    
    #split into a string outputting [year, month, day, hour, minutes, seconds]
    date = value.split(' ')
    
    #leave hour/min/sec, convert to integers
    hour = date[3]
    int_hour = int(hour)
    
    #get day/night and return it
    if 7 < int_hour < 21:
        return 'DAY'
    else:
        return 'NIGHT'

day_night = [get_day_night(value) for value in admTime]

day_night_df = pd.DataFrame({'DAY_NIGHT' : day_night})
patient_info = pd.concat([patient_info, day_night_df], axis=1)

# Get admission day
admTime = patient_info['ADMITTIME']

def get_day_of_week (value):
    date = []
    int_date = []
    day_of_week = []
    
    #replace dashes and colons with spaces to make breaking the string up easier
    value = value.replace('-', ' ')
    value = value.replace(':', ' ')
    
    #split into a string outputting [year, month, day, hour, minutes, seconds]
    date = value.split(' ')
    
    #delete hour/min/sec, convert to integers
    del date[3:6]
    int_date = [int(i) for i in date]
    
    #get day of week and return it
    day_of_week = datetime.date(int_date[0], int_date[1], int_date[2]).weekday()
    
    return day_of_week


admTime_weekday = [get_day_of_week(value) for value in admTime]

weekday_data_df = pd.DataFrame({'ADM_DAY' : admTime_weekday})
patient_info = pd.concat([patient_info, weekday_data_df], axis=1)

# Clean up columns
patient_times_info = patient_info[['SUBJECT_ID', 'ADMITTIME', 'DISCHTIME', 'DOB']].copy()
patient_times_info = patient_times_info.drop_duplicates(subset = 'SUBJECT_ID', keep = 'first')

patient_info = patient_info.drop(['ADMITTIME', 'DISCHTIME', 'DOB'], axis = 1)

LABEVENTS

In [5]:
# Convert labevents[FLAG] : NaN to 0, abnomal to 1
labevents['FLAG'] = labevents['FLAG'].replace('nan', 0)
labevents['FLAG'] = labevents['FLAG'].replace('abnormal', 1)

# Merge d_labitems onto labevents 
labevents = pd.merge(labevents, d_labitems, on = 'ITEMID')

INPUTEVENTS (CV AND MV)

In [6]:
# Merge d_items onto inputevents_cv, and inputevents_mv separately
inputevents_cv = pd.merge(inputevents_cv, d_items, on = 'ITEMID')
inputevents_mv = pd.merge(inputevents_mv, d_items, on = 'ITEMID')

# Heparin Data Organization

In [7]:
patient_info.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMISSION_LOCATION,DIAGNOSIS,GENDER,EXPIRE_FLAG,ETHNICITY,AGE,AGE_CAT,LOS,DAY_NIGHT,ADM_DAY
0,22,165315,EMERGENCY ROOM ADMIT,BENZODIAZEPINE OVERDOSE,F,0,WHITE,64,Elderly,1.1,DAY,5
1,23,152223,PHYS REFERRAL/NORMAL DELI,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,M,0,WHITE,71,Elderly,5.5,NIGHT,0
2,23,124321,TRANSFER FROM HOSP/EXTRAM,BRAIN MASS,M,0,WHITE,75,Elderly,6.8,DAY,1
3,24,161859,TRANSFER FROM HOSP/EXTRAM,INTERIOR MYOCARDIAL INFARCTION,M,0,WHITE,39,30s,2.9,DAY,5
4,25,129635,EMERGENCY ROOM ADMIT,ACUTE CORONARY SYNDROME,M,0,WHITE,58,50s,3.5,NIGHT,6


1\. UNIQUE ADULTS

In [8]:
# Get only first ICU stay for each patient
patient_info = patient_info.drop_duplicates(subset = 'SUBJECT_ID', keep = 'first')

# Find all Heparin patients
word_list = ['Heparin']
heparin_cv = inputevents_cv.loc[inputevents_cv['LABEL'].isin(word_list)]
heparin_mv = inputevents_mv[inputevents_mv['LABEL'].str.contains('|'.join(word_list))]

# Drop Prophylaxis from heparin_mv
heparin_mv = heparin_mv[heparin_mv['LABEL'] != 'Heparin Sodium (Prophylaxis)']

In [9]:
# Count total patients recieving UFH
gp_cv = heparin_cv['SUBJECT_ID'].unique().tolist()
gp_mv = heparin_mv['SUBJECT_ID'].unique().tolist()
patient_count = len(gp_cv) + len(gp_mv)
print('Total PT recieving UFH: ', patient_count)

Total PT recieving UFH:  8095


In [10]:
# Update patient_info to heparin_patients
patient_list = gp_cv + gp_mv

heparin_patients = pd.DataFrame({'SUBJECT_ID' : patient_list})
heparin_patients = heparin_patients.drop_duplicates(subset = 'SUBJECT_ID', keep = 'first')
heparin_patients = pd.merge(heparin_patients, patient_info, on = 'SUBJECT_ID')
heparin_patients.shape

(7974, 12)

2\. aPTT MEASUREMENT

In [12]:
# Find all aPTT measurements
word_list = ['PTT']
aPTT = labevents[labevents['LABEL'].str.contains('|'.join(word_list))]
aPTT = aPTT.reset_index(drop = True)
aPTT['HADM_ID'] = aPTT['HADM_ID'].str.replace('.0', '')

# Remove repeat HADM_ID
word_list = heparin_patients['HADM_ID'].tolist()
aPTT = aPTT.loc[aPTT['HADM_ID'].isin(word_list)]

# Create Groups
gp_ptt = aPTT['SUBJECT_ID'].unique().tolist()
patient_count = len(gp_ptt)

print('Total PT with aPTT tests: ', patient_count)

Total PT with aPTT tests:  4195


In [13]:
# Update heparin_patients
patient_list_ptt = gp_ptt
aPTT_patients = pd.DataFrame({'SUBJECT_ID' : patient_list_ptt})
heparin_patients = pd.merge(heparin_patients, aPTT_patients, on = 'SUBJECT_ID')
heparin_patients.shape

(4195, 12)

In [14]:
ptts = aPTT['VALUE']

def get_aPTT(ptt):
    if ptt == '>150':
        return ('150.0')
    elif ptt == '>150.0':
        return ('150.0')
    elif ptt == '> 150':
        return ('150.0')
    elif len(ptt) > 5:
        return (None)
    elif ptt == 'ERROR':
        return (None)
    ptt = ptt.replace('..', '.')
    return(ptt)

def get_aPTT_category(ptt):
    if ptt == None:
        return (None)
    ptt = float(ptt)
    if ptt > 100:
        return ('SUPRA-TH')
    elif ptt < 60:
        return('SUB-TH')
    else:
        return('TH')

ptts_new = []
ptt_categories = []

for ptt in ptts:
    ptts_new.append(get_aPTT(ptt))

for ptt in ptts_new:
    ptt_categories.append(get_aPTT_category(ptt))

ptt_df = pd.DataFrame({'aPTT' : ptts_new})
ptt_cat_df = pd.DataFrame({'aPTT_CAT' : ptt_categories})

aPTT = aPTT.reset_index(drop = True)
aPTT = pd.concat([aPTT, ptt_df], axis = 1)
aPTT = pd.concat([aPTT, ptt_cat_df], axis = 1)
aPTT = aPTT[['SUBJECT_ID', 'HADM_ID', 'ITEMID', 'CHARTTIME', 'aPTT', 'VALUEUOM', 'LABEL', 'aPTT_CAT', 'FLUID', 'CATEGORY']].copy()

3\. NON TRANSFERS

In [18]:
heparin_patients = heparin_patients[heparin_patients.ADMISSION_LOCATION != 'TRANSFER FROM SKILLED NUR']
heparin_patients = heparin_patients[heparin_patients.ADMISSION_LOCATION != 'TRANSFER FROM HOSP/EXTRAM']
heparin_patients = heparin_patients[heparin_patients.ADMISSION_LOCATION != 'TRANSFER FROM OTHER HEALT']
heparin_patients = heparin_patients[heparin_patients.ADMISSION_LOCATION != '** INFO NOT AVAILABLE **']

heparin_patients.shape

(2914, 12)

4\. FEATURES

Dosage Start Time

In [34]:
# Update heparin dosage dataframes
word_list = heparin_patients['HADM_ID'].tolist()
heparin_cv['HADM_ID'] = heparin_cv['HADM_ID'].str.replace('.0', '')
heparin_cv = heparin_cv.loc[heparin_cv['HADM_ID'].isin(word_list)]
heparin_mv = heparin_mv.loc[heparin_mv['HADM_ID'].isin(word_list)]

# Extract IDs and Start Times
dose_cv = heparin_cv[['SUBJECT_ID', 'CHARTTIME']].copy()
dose_cv = dose_cv.rename(columns = {'CHARTTIME': 'DOSE_TIME'})

dose_mv = heparin_mv[['SUBJECT_ID', 'STARTTIME']].copy()
dose_mv = dose_mv.rename(columns = {'STARTTIME': 'DOSE_TIME'})

# Combine and take First Dose
dose_info = pd.concat([dose_cv, dose_mv])
first_dose = dose_info.drop_duplicates(subset = 'SUBJECT_ID', keep = 'first')

#Update patient numbers
x = heparin_cv['SUBJECT_ID'].unique().tolist()
y = heparin_mv['SUBJECT_ID'].unique().tolist()
z = x+y
z_df = pd.DataFrame({'SUBJECT_ID' : z})
z_df = z_df.drop_duplicates(subset = 'SUBJECT_ID', keep = 'first')
heparin_patients = pd.merge(heparin_patients, z_df, on = 'SUBJECT_ID')
print(heparin_patients.shape)
print ('Total patients with first dose: ', len(z))

(2440, 12)
Total patients with first dose:  2440


MEASUREMENT TIME (dose to aPTT)

In [35]:
aPTT = aPTT.rename(columns = {'CHARTTIME': 'aPTT_TIME'})
measure_time = aPTT[['SUBJECT_ID', 'aPTT_TIME']].copy()
measure_time = pd.merge(measure_time, first_dose, on = 'SUBJECT_ID')

doseDate = measure_time['DOSE_TIME']
measureDate = measure_time['aPTT_TIME']

def get_length (first, second):
    #replace dashes and colons with spaces to make breaking the string up easier
    first = first.replace('-', ' ')
    first = first.replace(':', ' ')
    second = second.replace('-', ' ')
    second = second.replace(':', ' ')

    #split into a string outputting [year, month, day, hour, minutes, seconds]
    first = first.split(' ')
    second = second.split(' ')

    #convert to integers
    first = [int(i) for i in first]
    second = [int(i) for i in second]
    
    #get los and return it
    first = datetime.datetime(first[0], first[1], first[2], first[3], first[4], first[5])
    second = datetime.datetime(second[0], second[1], second[2], second[3], second[4], second[5])
    length = (second-first).total_seconds()/3600
    length = round(length, 1)

    return length

measure_list = []

for i in range(len(doseDate)):
    measure_list.append(get_length(doseDate[i], measureDate[i]))
    
measure_df = pd.DataFrame({'MEASURE_TIME' : measure_list})
measure_time = pd.concat([measure_time, measure_df], axis = 1)
measure_time.head()

Unnamed: 0,SUBJECT_ID,aPTT_TIME,DOSE_TIME,MEASURE_TIME
0,3,2101-10-22 04:30:00,2101-10-23 08:00:00,-27.5
1,3,2101-10-22 12:45:00,2101-10-23 08:00:00,-19.2
2,3,2101-10-22 21:15:00,2101-10-23 08:00:00,-10.8
3,3,2101-10-23 03:45:00,2101-10-23 08:00:00,-4.2
4,3,2101-10-23 10:10:00,2101-10-23 08:00:00,2.2


DOSAGE TIME (adm to dose)

In [36]:
adm = patient_times_info[['SUBJECT_ID', 'ADMITTIME']].copy()
dosage_info = pd.merge(adm, first_dose, on = 'SUBJECT_ID')

admDate = dosage_info['ADMITTIME']
doseDate = dosage_info['DOSE_TIME']

dosage_list = []

for i in range(len(admDate)):
    dosage_list.append(get_length(admDate[i], doseDate[i]))
    
dosage_df = pd.DataFrame({'DOSAGE_TIME' : dosage_list})
dosage_time = pd.concat([dosage_info, dosage_df], axis = 1)
dosage_time.head()

Unnamed: 0,SUBJECT_ID,ADMITTIME,DOSE_TIME,DOSAGE_TIME
0,25,2160-11-02 02:06:00,2160-11-03 10:00:00,31.9
1,107,2115-02-20 17:41:00,2115-02-20 20:00:00,2.3
2,130,2119-10-29 14:49:00,2119-10-30 23:00:00,32.2
3,154,2117-12-29 21:36:00,2117-12-29 22:00:00,0.4
4,111,2142-04-24 06:55:00,2142-04-24 21:00:00,14.1


COMBINE