In [2]:
print(__doc__)

# Common imports
import numpy as np
import numpy.random as rnd
import os
import pandas as pd

# File specific imports
import datetime
from dateutil.relativedelta import relativedelta

# To make this notebook's output stable across runs
rnd.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10

Automatically created module for IPython interactive environment


In [None]:
# Directory that holds the MIMIC-III CSV exports. It defaults to the original
# location and can be overridden with the MIMIC3_CSV_DIR environment variable,
# so the notebook can run on another machine without editing every path.
MIMIC3_CSV_DIR = os.environ.get('MIMIC3_CSV_DIR', '/media/bigdatabuffalo/drive/MIMIC3/CSV')

# Each table is read with only the columns listed in cols_to_keep.
csv_path = os.path.join(MIMIC3_CSV_DIR, 'ADMISSIONS.csv')
cols_to_keep = ['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'ADMISSION_LOCATION', 'ETHNICITY', 'DIAGNOSIS']
admissions = pd.read_csv(csv_path, skipinitialspace = True, usecols = cols_to_keep)

csv_path = os.path.join(MIMIC3_CSV_DIR, 'PATIENTS.csv')
cols_to_keep = ['SUBJECT_ID', 'GENDER', 'DOB', 'EXPIRE_FLAG']
patients = pd.read_csv(csv_path, skipinitialspace = True, usecols = cols_to_keep)

csv_path = os.path.join(MIMIC3_CSV_DIR, 'INPUTEVENTS_CV.csv')
cols_to_keep = ['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'CHARTTIME', 'ITEMID', 'AMOUNT', 'AMOUNTUOM'] # 'RATE', 'RATEUOM', 'STOPPED' are full of NaN
inputevents_cv = pd.read_csv(csv_path, skipinitialspace = True, usecols = cols_to_keep)

csv_path = os.path.join(MIMIC3_CSV_DIR, 'INPUTEVENTS_MV.csv')
cols_to_keep = ['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'STARTTIME', 'ENDTIME', 'ITEMID', 'AMOUNT', 'AMOUNTUOM', 'RATE', 'RATEUOM', 'ORDERCATEGORYNAME', 'ORDERCATEGORYDESCRIPTION']
inputevents_mv = pd.read_csv(csv_path, skipinitialspace = True, usecols = cols_to_keep)

csv_path = os.path.join(MIMIC3_CSV_DIR, 'LABEVENTS.csv')
cols_to_keep = ['SUBJECT_ID', 'ITEMID', 'CHARTTIME', 'VALUE', 'VALUENUM', 'VALUEUOM', 'FLAG'] # 'HADM_ID' is full of NaN
labevents = pd.read_csv(csv_path, skipinitialspace = True, usecols = cols_to_keep)

csv_path = os.path.join(MIMIC3_CSV_DIR, 'D_ITEMS.csv')
cols_to_keep = ['ITEMID', 'LABEL'] #'CATEGORY', 'UNITNAME', 'PARAM_TYPE' are full of NaN
d_items = pd.read_csv(csv_path, skipinitialspace = True, usecols = cols_to_keep)

csv_path = os.path.join(MIMIC3_CSV_DIR, 'D_LABITEMS.csv')
cols_to_keep = ['ITEMID', 'LABEL', 'FLUID', 'CATEGORY']
d_labitems = pd.read_csv(csv_path, skipinitialspace = True, usecols = cols_to_keep)

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
def _as_text(frame):
    """Return a new frame in which str() has been applied to every cell.

    Missing values therefore become the literal string 'nan'.
    """
    return frame.applymap(str)

# Tables are converted one after another so each original frame can be
# released as soon as its text copy replaces it.
admissions = _as_text(admissions)
patients = _as_text(patients)
inputevents_cv = _as_text(inputevents_cv)
inputevents_mv = _as_text(inputevents_mv)
labevents = _as_text(labevents)
d_items = _as_text(d_items)
d_labitems = _as_text(d_labitems)

# PATIENT INFORMATION

In [None]:
# Join each admission row with the matching patient row on SUBJECT_ID
patient_info = admissions.merge(patients, on = ['SUBJECT_ID'])

# Ethnicity is collapsed to WHITE / NOT WHITE below
ethnicity = patient_info.loc[:, 'ETHNICITY']

def get_ethnicity(value):
    """Collapse an ethnicity label to 'WHITE' or 'NOT WHITE'.

    Only the exact string 'WHITE' maps to 'WHITE'; every other value,
    whatever it is, maps to 'NOT WHITE'.
    """
    # NOTE(review): labels that merely start with 'WHITE' (for example
    # 'WHITE - RUSSIAN') end up as 'NOT WHITE' — confirm this is intended.
    return 'WHITE' if value == 'WHITE' else 'NOT WHITE'

list_ethnicity = list(map(get_ethnicity, ethnicity))

# The raw column is removed and the binary one is appended as the last column
df_ethnicity = pd.DataFrame({'ETHNICITY' : list_ethnicity})
patient_info = pd.concat([patient_info.drop('ETHNICITY', axis = 1), df_ethnicity], axis = 1)

# Add ages
DoB, admDate = patient_info['DOB'], patient_info['ADMITTIME']

def get_age (birth, current):
    """Return the age in whole years between two timestamp strings.

    Parameters
    ----------
    birth, current : str
        Timestamps formatted as 'YYYY-MM-DD HH:MM:SS'.

    Returns
    -------
    int
        Completed years between `birth` and `current`. Any result of 300 or
        more is reported as 89, the placeholder used for the 89+ group.

    Raises
    ------
    ValueError
        If either string does not match the expected timestamp format.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    birth = datetime.datetime.strptime(birth, timestamp_format)
    current = datetime.datetime.strptime(current, timestamp_format)

    age = relativedelta(current, birth).years

    # Adjust for the 89+ category. Per the MIMIC-III documentation, the date
    # of birth of these patients is shifted so that they are 300 years old at
    # their first admission; later admissions therefore yield 301, 302, ...
    # and must be caught as well, hence ">=" rather than "==".
    if age >= 300:
        age = 89

    return age

def get_age_category(age):
    """Return the age bracket label for an age in whole years.

    Brackets: under 18 -> 'Child', 18-29 -> '20s', 30-39 -> '30s',
    40-49 -> '40s', 50-59 -> '50s', 60-88 -> 'Elderly', 89 and over -> '89+'.

    The lower bound of each decade is inclusive, so that an age of exactly
    30, 40, 50 or 60 lands in the bracket its label names.
    """
    if age >= 89:
        return '89+'
    if age >= 60:
        return 'Elderly'
    if age >= 50:
        return '50s'
    if age >= 40:
        return '40s'
    if age >= 30:
        return '30s'
    if age >= 18:
        return '20s'
    return 'Child'

# Age for every row, followed by the bracket each age falls into
ages = [get_age(DoB[row], admDate[row]) for row in range(len(DoB))]
age_categories = [get_age_category(years) for years in ages]

# Both new columns are appended to the right of patient_info
ages_df = pd.DataFrame({'AGE' : ages})
ages_cat_df = pd.DataFrame({'AGE_CAT' : age_categories})
patient_info = pd.concat([patient_info, ages_df, ages_cat_df], axis = 1)

# Add LOS
admDate, dischDate = patient_info['ADMITTIME'], patient_info['DISCHTIME']

def get_los (admit, disch):
    """Return the time from `admit` to `disch` in days, rounded to one decimal.

    Both arguments are timestamp strings laid out as
    'year-month-day hour:minute:second'.
    """
    def to_datetime(stamp):
        # Dashes and colons become spaces, so a single split yields
        # [year, month, day, hour, minutes, seconds]
        numbers = [int(piece) for piece in stamp.replace('-', ' ').replace(':', ' ').split(' ')]
        return datetime.datetime(numbers[0], numbers[1], numbers[2], numbers[3], numbers[4], numbers[5])

    start = to_datetime(admit)
    end = to_datetime(disch)

    # Seconds -> hours -> days
    elapsed_hours = (end - start).total_seconds() / 3600
    return round(elapsed_hours / 24, 1)

# Length of stay in days for every row
los = [get_los(admDate[row], dischDate[row]) for row in range(len(admDate))]

los_df = pd.DataFrame({'LOS' : los})
patient_info = pd.concat([patient_info, los_df], axis = 1)

# Get Day/Night (day is between 8 am : 8 pm)
admTime = patient_info.loc[:, 'ADMITTIME']

def get_day_night (value):
    """Classify a timestamp string as 'DAY' or 'NIGHT' from its hour.

    Parameters
    ----------
    value : str
        Timestamp laid out as 'year-month-day hour:minute:second'.

    Returns
    -------
    str
        'DAY' for hours 8 through 19 (8:00 am up to, but not including,
        8:00 pm), otherwise 'NIGHT'.
    """
    # Dashes and colons become spaces, so the hour is the fourth field of
    # [year, month, day, hour, minutes, seconds]
    int_hour = int(value.replace('-', ' ').replace(':', ' ').split(' ')[3])

    # Half-open interval [8, 20): the 8 pm hour itself belongs to the night
    if 8 <= int_hour < 20:
        return 'DAY'
    return 'NIGHT'

day_night = list(map(get_day_night, admTime))

# Append the DAY / NIGHT label as a new right-most column
day_night_df = pd.DataFrame({'DAY_NIGHT' : day_night})
patient_info = pd.concat([patient_info, day_night_df], axis = 1)

# Get admission day
admTime = patient_info.loc[:, 'ADMITTIME']

def get_day_of_week (value):
    """Return the weekday of a timestamp string as an integer.

    The value comes from datetime.date.weekday(), so Monday is 0 and
    Sunday is 6. `value` is laid out as 'year-month-day hour:minute:second'.
    """
    # Dashes and colons become spaces, so a single split yields
    # [year, month, day, hour, minutes, seconds]
    pieces = value.replace('-', ' ').replace(':', ' ').split(' ')

    # Positions 3-5 (hour, minutes, seconds) are left out before converting
    numbers = [int(piece) for position, piece in enumerate(pieces) if not 3 <= position < 6]

    return datetime.date(numbers[0], numbers[1], numbers[2]).weekday()


admTime_weekday = list(map(get_day_of_week, admTime))

# Append the admission weekday number as a new right-most column
weekday_data_df = pd.DataFrame({'ADM_DAY' : admTime_weekday})
patient_info = pd.concat([patient_info, weekday_data_df], axis = 1)

# Clean up columns: keep a copy of the raw timestamp columns in their own
# frame, then remove them from patient_info
patient_times_info = patient_info[['ADMITTIME', 'DISCHTIME', 'DOB']].copy()
patient_info = patient_info.drop(['ADMITTIME', 'DISCHTIME', 'DOB'], axis = 1)

LABEVENTS

In [None]:
# Recode labevents[FLAG]: the string 'nan' becomes 0, 'abnormal' becomes 1;
# any other value is left as it is
labevents['FLAG'] = labevents['FLAG'].replace('nan', 0).replace('abnormal', 1)

# Attach the d_labitems columns to every lab event sharing its ITEMID
labevents = labevents.merge(d_labitems, on = 'ITEMID')

INPUTEVENTS (CV AND MV)

In [None]:
# Merge d_items onto inputevents_cv, and inputevents_mv separately
# Attach the d_items columns (the LABEL) to each input-events table in turn
inputevents_cv = inputevents_cv.merge(d_items, on = 'ITEMID')
inputevents_mv = inputevents_mv.merge(d_items, on = 'ITEMID')

# Heparin Data Organization

In [None]:
# Display the first rows of patient_info as a quick check of the columns built so far
patient_info.head()

1\. UNIQUE ADULTS

In [None]:
# Keep one row per patient: the first row found for each SUBJECT_ID.
# NOTE(review): this section is titled "UNIQUE ADULTS", but nothing here
# filters on AGE or AGE_CAT — confirm whether children should be excluded.
patient_info = patient_info.drop_duplicates(subset = 'SUBJECT_ID', keep = 'first')

# Input events whose LABEL contains any of the words in word_list
word_list = ['Heparin']
heparin_cv = inputevents_cv[inputevents_cv['LABEL'].str.contains('|'.join(word_list))]
heparin_mv = inputevents_mv[inputevents_mv['LABEL'].str.contains('|'.join(word_list))]

# Drop Prophylaxis from heparin_mv
heparin_mv = heparin_mv[heparin_mv['LABEL'] != 'Heparin Sodium (Prophylaxis)']

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
heparin_cv.info()
heparin_mv.info()

In [None]:
# Create Groups
gp_cv = heparin_cv.groupby('SUBJECT_ID')
gp_mv = heparin_mv.groupby('SUBJECT_ID')
patient_count = 0

for g in gp_cv:
    patient_count += 1
for g in gp_mv:
    patient_count += 1
print('Total PT recieving UFH: ', patient_count)

In [None]:
# Update patient_info to heparin_patients
patient_list_cv = list(gp_cv.groups.keys())
patient_list_mv = list(gp_mv.groups.keys())
patient_list = patient_list_cv + patient_list_mv

heparin_patients = pd.DataFrame({'SUBJECT_ID' : patient_list})
heparin_patients = pd.merge(heparin_patients, patient_info, on = 'SUBJECT_ID')

2\. aPTT MEASUREMENT

In [None]:
# Find all aPTT measurements
word_list = ['PTT']
aPTT = labevents[labevents['LABEL'].str.contains('|'.join(word_list))]

# Create Groups
gp_ptt = aPTT.groupby('SUBJECT_ID')
patient_count = 0

for g in gp_ptt:
    patient_count += 1

print('Total PT with aPTT tests: ', patient_count)

In [None]:
# Update heparin_patients
patient_list_ptt = list(gp_ptt.groups.keys())
aPTT_patients = pd.DataFrame({'SUBJECT_ID' : patient_list_ptt})
heparin_patients = pd.merge(heparin_patients, aPTT_patients, on = 'SUBJECT_ID')
heparin_patients.info()

In [None]:
# VALUE column of the aPTT rows; each entry is classified by get_aPTT_category below
ptts = aPTT['VALUE']

def get_aPTT_category(ptt):
    """Classify one aPTT value, given as text, against the therapeutic range.

    Parameters
    ----------
    ptt : str
        The lab value as a string.

    Returns
    -------
    str or None
        'SUB-TH' below 60, 'TH' from 60 to 100 inclusive, 'SUPRA-TH' above
        100 (including the capped readings '>150', '>150.0' and '> 150').
        None when the value cannot be classified: 'ERROR', any other string
        longer than five characters, text that is not a number, or a missing
        value (the string 'nan').
    """
    if ptt in ('>150', '>150.0', '> 150'):
        return 'SUPRA-TH'
    if len(ptt) > 5 or ptt == 'ERROR':
        return None

    try:
        # A doubled decimal point ('..') is collapsed to a single one
        value = float(ptt.replace('..', '.'))
    except ValueError:
        # Free-text results have no category
        return None

    # float('nan') parses without error but compares False with everything,
    # which would otherwise fall through to 'TH'. NaN is the only value that
    # is not equal to itself.
    if value != value:
        return None

    if value > 100:
        return 'SUPRA-TH'
    if value < 60:
        return 'SUB-TH'
    return 'TH'

ptt_categories = [get_aPTT_category(ptt) for ptt in ptts]

ptt_cat_df = pd.DataFrame({'aPTT_CAT' : ptt_categories})
# Rows are renumbered from 0 first so the side-by-side concat lines up with
# the freshly built ptt_cat_df
aPTT = pd.concat([aPTT.reset_index(drop = True), ptt_cat_df], axis = 1)

3\. NON TRANSFERS

In [None]:
# Admission locations that remove a patient from the cohort
excluded_locations = [
    'TRANSFER FROM SKILLED NUR',
    'TRANSFER FROM HOSP/EXTRAM',
    'TRANSFER FROM OTHER HEALT',
    '** INFO NOT AVAILABLE **',
]
came_from_excluded = heparin_patients.ADMISSION_LOCATION.isin(excluded_locations)
heparin_patients = heparin_patients[~came_from_excluded]

heparin_patients.info()

4\. FEATURES

In [None]:
# Missing: SOFA score (stored in CHARTEVENTS, ITEMID 20002)
# Dosage time (admission to dose)
# Measurement time (dose to aPTT)
# Weight-normalized dose

Calculate first aPTT times

In [None]:
# Earliest aPTT row per patient. The rows are ordered by CHARTTIME before the
# duplicates are dropped, so "first" means earliest rather than whichever row
# happens to come first in the table; sort_index() then restores the original
# row order of the survivors.
# NOTE(review): CHARTTIME is compared as text, which is chronological only for
# zero-padded 'YYYY-MM-DD HH:MM:SS' strings — confirm the column's format.
first_aPTT = (
    aPTT.sort_values('CHARTTIME', kind = 'mergesort')
        .drop_duplicates(subset = 'SUBJECT_ID', keep = 'first')
        .sort_index()
)
first_aPTT = first_aPTT.rename(columns = {'CHARTTIME': 'aPTT_TIME'})

# Update heparin dosage dataframes: keep only rows of patients still in the cohort
word_list = heparin_patients['SUBJECT_ID'].tolist()
heparin_cv = heparin_cv.loc[heparin_cv['SUBJECT_ID'].isin(word_list)]
heparin_mv = heparin_mv.loc[heparin_mv['SUBJECT_ID'].isin(word_list)]

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
heparin_cv.info()
heparin_mv.info()