# EXPLORING THE RELATIONSHIP BETWEEN CHART EVENTS AND TIME TO ICU STAY
Note: this dataset is open-source, but restricted access. You must request access via https://mimic.physionet.org/gettingstarted/access/. I downloaded the dataset onto my personal computer and ran the analyses locally. Given HIPAA and privacy considerations, I will only show summary plots from data in the database (no patient-specific information will be displayed.

This notebook utilizes previously analyzed data that looks at chart events (e.g., vital signs, lab results, etc.) that occurred between hospital admission and ICU stay to test whether certain events/measurements may predict "imminent" (e.g., <=1 day) ICU stays.<br>

Briefly, data from the PATIENTS, ADMISSIONS, ICUSTAYS, PRESCRIPTIONS, and CHARTEVENTS databases are merged based on subject ID, hospital admission ID, and ICU stay (only drugs prescribed and chart events recorded after hospital admission time and before ICU stay were included in the dataframe). Finally, 
# NEED TO COMPLETE 
<br>

The code to perform these analyses can be found on my github page (https://github.com/adamgiffordphd/imminent_icu_stays). The code includes functionality to parallel process the analysis to get through all ~330M rows in CHARTEVENTS.csv. This code was run on a private server with 40 processors.

In [30]:
import pickle
import glob
from numpy import append, unique, mean

In [2]:
# there are ~3300 pickle files that contain the data that is described above
# this cell finds the pickle files in the saved data directory
# drugs.pickle is a list of all unique drugs in the dataset
pckl_files = glob.glob("pickle/20200811/*.pickle")

In [5]:
df = pickle.load(open(pckl_files[1],'rb'))
df['SAMEDAY_CHRT_TO_ICU'] = df['DAYS_CHRT_TO_ICU'].apply(lambda x: int(x<=1))

In [6]:
df_bySubjAdICU = df.groupby(['SUBJECT_ID','HADM_ID','ICUSTAY_ID']).agg({'ITEMID': [list],'DAYS_CHRT_TO_ICU': [sum],'SAMEDAY_CHRT_TO_ICU': [sum]})

In [14]:
append(df_bySubjAdICU.loc[(19185, 103196, 281157)][('ITEMID','list')],df_bySubjAdICU.loc[(19185, 103196, 281157)][('ITEMID','list')])

array([ 772,  781,  786,  787,  788,  791,  811,  813,  814,  821,  828,
        829,  833,  837,  861, 1127, 1162, 1521, 1522, 1523, 1525, 1529,
       1532, 1535, 1536, 1542,  772,  781,  786,  787,  788,  791,  811,
        813,  814,  821,  828,  829,  833,  837,  861, 1127, 1162, 1521,
       1522, 1523, 1525, 1529, 1532, 1535, 1536, 1542])

In [7]:
df_bySubjAdICU[('SUBJECT_HADM_ICU_ID','count')] = df_bySubjAdICU[('ITEMID','list')].apply(lambda x: len(x))
df_bySubjAdICU[('ITEMID','list')] = df_bySubjAdICU[('ITEMID','list')].apply(lambda x: unique(x))
df_bySubjAdICU

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ITEMID,DAYS_CHRT_TO_ICU,SAMEDAY_CHRT_TO_ICU,SUBJECT_HADM_ICU_ID
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,list,sum,sum,count
SUBJECT_ID,HADM_ID,ICUSTAY_ID,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
19185,103196,281157,"[772, 781, 786, 787, 788, 791, 811, 813, 814, ...",29.328542,1014,1014
19246,100942,212853,"[742, 1125]",0.395046,46,46
19246,103522,239485,"[27, 31, 32, 54, 70, 71, 72, 77, 80, 82, 83, 8...",15.052222,1488,1488
19246,124035,203260,"[781, 787, 788, 791, 811, 813, 814, 815, 824, ...",49.040833,504,504
19246,129654,274628,"[824, 828, 829, 837, 861, 1127, 1162, 1286, 15...",9.646875,315,315
19246,150429,283770,"[781, 784, 786]",16.4475,102,102
19310,157811,250035,"[27, 31, 32, 39, 40, 50, 52, 54, 69, 80, 82, 8...",36.788889,1344,1344
19316,198168,257555,"[916, 917, 919, 920, 924, 925, 926, 927, 930, ...",0.102083,180,180


In [8]:
df_bySubjAdICU.columns

MultiIndex([(             'ITEMID',  'list'),
            (   'DAYS_CHRT_TO_ICU',   'sum'),
            ('SAMEDAY_CHRT_TO_ICU',   'sum'),
            ('SUBJECT_HADM_ICU_ID', 'count')],
           )

In [15]:
# combine the data across pickle files
'''note: have to load and combine the data in batches because the resulting dataframe would be too
large. will do in batches of ~50, and compute running stats for visualization and assessment'''

for st_ix in range(0,len(pckl_files),50):
    print(st_ix)
    if st_ix + 50 > len(pckl_files):
        en_ix = len(pckl_files)
    else:
        en_ix = st_ix + 50
        
    for f_ix in range(st_ix,en_ix):
        if f_ix==st_ix:
            df = pickle.load(open(pckl_files[f_ix],'rb'))
        else:
            tmp = pickle.load(open(pckl_files[f_ix],'rb'))
            df = df.append(tmp)
            
    df['SAMEDAY_CHRT_TO_ICU'] = df['DAYS_CHRT_TO_ICU'].apply(lambda x: int(x<=1))
    if st_ix==0: 
        df_bySubjAdICU = df.groupby(['SUBJECT_ID','HADM_ID','ICUSTAY_ID']).agg({'ITEMID': [list],'DAYS_CHRT_TO_ICU': [sum],'SAMEDAY_CHRT_TO_ICU': [sum]})
        df_bySubjAdICU[('SUBJECT_HADM_ICU_ID','count')] = df_bySubjAdICU[('ITEMID','list')].apply(lambda x: len(x))
        df_bySubjAdICU[('ITEMID','list')] = df_bySubjAdICU[('ITEMID','list')].apply(lambda x: unique(x))
    else:
        tmp = df.groupby(['SUBJECT_ID','HADM_ID','ICUSTAY_ID']).agg({'ITEMID': [list],'DAYS_CHRT_TO_ICU': [sum],'SAMEDAY_CHRT_TO_ICU': [sum]})
        tmp[('SUBJECT_HADM_ICU_ID','count')] = tmp[('ITEMID','list')].apply(lambda x: len(x))
        tmp[('ITEMID','list')] = tmp[('ITEMID','list')].apply(lambda x: unique(x))
        
        # check if ['SUBJECT_ID','HADM_ID','ICUSTAY_ID'] is in previous files and current file, combine if so
        for m_ix in tmp.index:
            if df_bySubjAdICU.index.isin([m_ix]).any():
                # append list and take unique
                df_bySubjAdICU.loc[m_ix][('ITEMID','list')] = unique(append(df_bySubjAdICU.loc[m_ix][('ITEMID','list')], tmp.loc[m_ix][('ITEMID','list')]))
                
                # sum the rest of the columns
                df_bySubjAdICU.loc[m_ix][('DAYS_CHRT_TO_ICU','sum')] = df_bySubjAdICU.loc[m_ix][('DAYS_CHRT_TO_ICU','sum')]+tmp.loc[m_ix][('DAYS_CHRT_TO_ICU','sum')]
                df_bySubjAdICU.loc[m_ix][('SAMEDAY_CHRT_TO_ICU','sum')] = df_bySubjAdICU.loc[m_ix][('SAMEDAY_CHRT_TO_ICU','sum')]+tmp.loc[m_ix][('SAMEDAY_CHRT_TO_ICU','sum')]
                df_bySubjAdICU.loc[m_ix][('SUBJECT_HADM_ICU_ID','count')] = df_bySubjAdICU.loc[m_ix][('SUBJECT_HADM_ICU_ID','count')]+tmp.loc[m_ix][('SUBJECT_HADM_ICU_ID','count')]


0
50


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300


In [18]:
df_bySubjAdICU.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ITEMID,DAYS_CHRT_TO_ICU,SAMEDAY_CHRT_TO_ICU,SUBJECT_HADM_ICU_ID
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,list,sum,sum,count
SUBJECT_ID,HADM_ID,ICUSTAY_ID,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
123,195632,227264.0,"[8381, 8392, 8393]",2.133924,111,111
199,125590,235675.0,"[226381, 226543, 226544, 227239, 227241, 227378]",44.083333,288,288
256,188869,254418.0,"[226381, 226543, 227378]",0.043681,111,111
266,186251,293876.0,"[226381, 226543, 226544, 227378]",0.036204,136,136
422,117029,299666.0,"[220048, 223753, 223781, 223782, 223783, 22378...",0.194074,32,32


In [21]:
df_bySubjAdICU[('DAYS_CHRT_TO_ICU','mean')] = df_bySubjAdICU[('DAYS_CHRT_TO_ICU','sum')] / df_bySubjAdICU[('SUBJECT_HADM_ICU_ID','count')]
df_bySubjAdICU[('SAMEDAY_CHRT_TO_ICU','mean')] = df_bySubjAdICU[('SAMEDAY_CHRT_TO_ICU','sum')] / df_bySubjAdICU[('SUBJECT_HADM_ICU_ID','count')]
df_bySubjAdICU.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ITEMID,DAYS_CHRT_TO_ICU,SAMEDAY_CHRT_TO_ICU,SUBJECT_HADM_ICU_ID,DAYS_CHRT_TO_ICU,SAMEDAY_CHRT_TO_ICU
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,list,sum,sum,count,mean,mean
SUBJECT_ID,HADM_ID,ICUSTAY_ID,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
123,195632,227264.0,"[8381, 8392, 8393]",2.133924,111,111,0.019225,1.0
199,125590,235675.0,"[226381, 226543, 226544, 227239, 227241, 227378]",44.083333,288,288,0.153067,1.0
256,188869,254418.0,"[226381, 226543, 227378]",0.043681,111,111,0.000394,1.0
266,186251,293876.0,"[226381, 226543, 226544, 227378]",0.036204,136,136,0.000266,1.0
422,117029,299666.0,"[220048, 223753, 223781, 223782, 223783, 22378...",0.194074,32,32,0.006065,1.0


In [22]:
foo = dict()

In [25]:
foo['bar'] = [1]
foo['bar']

[1]

In [26]:
foo['bar'].append([2])

In [28]:
item_dict = dict()
for m_ix in df_bySubjAdICU.index:
    items = df_bySubjAdICU.loc[m_ix][('ITEMID','list')]
    for item in items:
        if item not in item_dict.keys():
            item_dict[item] = [df_bySubjAdICU.loc[m_ix][('DAYS_CHRT_TO_ICU','mean')]]
        else:
            item_dict[item].append([df_bySubjAdICU.loc[m_ix][('DAYS_CHRT_TO_ICU','mean')]])

In [31]:
item_dict_mn = dict()
for key in item_dict.keys():
    item_dict_mn[key] = mean(item_dict[key])

In [66]:
import numpy as np
item_ids = np.array([i for i in item_dict_mn.keys()])
item_mn_days = np.array([i for i in item_dict_mn.values()])

In [67]:
ix = np.argsort(item_mn_days)

In [68]:
ix

array([708, 755, 753, 754,  29,  25,  24,  23,  22,  28, 641, 640, 659,
       650, 642, 687, 685, 683, 696, 656, 643, 644, 645, 647, 657, 649,
       648, 652, 653, 654, 655, 661, 660, 651, 658, 698, 699, 700, 769,
       757, 760, 761, 762, 759, 756, 763, 758, 765, 768, 766, 767, 764,
       540, 697, 692, 691, 734, 736, 737, 739, 735, 738, 688, 686, 684,
       541, 571, 572, 429, 433, 428, 431, 430,  10, 542, 522, 405, 416,
       461, 462, 584, 721,  11, 689, 404,   9, 690, 421, 482, 480, 695,
       715, 717, 244, 243, 732, 435,  17, 694, 436, 438, 439, 312, 413,
       409, 414, 415, 412, 408, 432,  14,  19,  16,  20,  13, 403, 618,
       619,  21, 538, 417, 537, 751, 742, 747, 748, 746, 745, 744, 733,
       740, 741, 743, 463, 464,  91,  96,  97,  98, 434, 437,   8, 470,
       468, 100, 101, 311,   3,   5,  92,  93,  94,  95, 539, 623, 624,
       625, 626, 622, 628, 621, 627, 620, 505, 506,  12, 714, 749,  26,
        27, 466, 410, 418, 716, 718, 719, 720, 596,   4, 423, 48

In [41]:
sorted_item_mn_days = item_mn_days[ix]
sorted_items = item_ids[ix]

TypeError: only integer scalar arrays can be converted to a scalar index