# The Request

Observations every 12 hours for each of the patients in the cohort throughout the month of June 2000. For each of these observations, we need the following data
● The ratio of discharged patients over the previous 4 hours to the number of nurses in the patient’s unit
● The ratio of nurses to patients in the patient’s unit
● The patient’s latest Tropinin lab value (confirm codes)
● The patient’s latest B-type natriuretic peptide (BNP) lab value (LOINC 33762-6)

# The Deliverable
Long DF with one row per patient per observation

In [1]:
import pandas as pd
import sqlite3
import datetime

# Set up the database connection
con = sqlite3.connect("data/DE_Challenge_DB.sqlite")

In [2]:
# build scaffolding for the snapshots
scaffold = pd.date_range(start='2000-06-01 06:59:00', end='2000-06-30 23:59:00', freq='12h')
snapdata = []
careteam = []
discharges = []

## Building out the Unit, Population and Staffing Information

In [3]:
# collect data at each of the given snapshot points
for snapdate in scaffold:
    # where is the patient right now?
    patient_location = pd.read_sql_query(f"""
            SELECT SUBJECT_ID,
            CURR_CAREUNIT as UNIT
            FROM TRANSFERS WHERE INTIME <= '{snapdate}'
            AND OUTTIME > '{snapdate}'
        """, con = con)
    patient_location['snapshot_time'] = snapdate
    snapdata.append(patient_location)

    # who is the care team for each patient right now?
    care_detail = pd.read_sql_query(f"""
            SELECT SUBJECT_ID, tt.CGID, LABEL, DESCRIPTION
            FROM TREATMENT_TEAM tt
            LEFT JOIN CARE_GIVERS cg on tt.CGID = cg.CGID
            WHERE tt.STARTTIME <= '{snapdate}'
            AND tt.ENDTIME > '{snapdate}'
            -- TODO add the CG filter for nurses
        """, con = con)
    care_detail['snapshot_time'] = snapdate
    careteam.append(care_detail)

    # how many patients were discharged from each unit right now?
    discharge_count = pd.read_sql_query(f"""
            SELECT PREV_CAREUNIT as UNIT,
            count(distinct(SUBJECT_ID)) as DISCHARGE_COUNT
            FROM TRANSFERS
            WHERE INTIME >= '{snapdate - datetime.timedelta(hours = 4)}'
              AND INTIME <= '{snapdate}'
             AND EVENTTYPE = 'discharge'
            GROUP BY PREV_CAREUNIT
        """, con = con)
    discharge_count['snapshot_time'] = snapdate
    discharges.append(discharge_count)

In [4]:
snap_df = pd.concat(snapdata)
care_df = pd.concat(careteam)
discharge_df = pd.concat(discharges).groupby(['snapshot_time','UNIT']).sum()

In [5]:
# here we  have a df showing where each patient was at a given time
snap_df.head()

Unnamed: 0,SUBJECT_ID,UNIT,snapshot_time
0,124,,2000-06-01 06:59:00
1,6,SICU,2000-06-01 06:59:00
2,109,,2000-06-01 06:59:00
3,110,NWARD,2000-06-01 06:59:00
4,223,,2000-06-01 06:59:00


In [6]:
care_df.head()

Unnamed: 0,SUBJECT_ID,CGID,LABEL,DESCRIPTION,snapshot_time
0,31842,18576.0,RN,RN,2000-06-01 06:59:00
1,28533,14612.0,RT,Respiratory,2000-06-01 06:59:00
2,31842,21108.0,RN,RN,2000-06-01 06:59:00
3,7482,21336.0,RNs,RN,2000-06-01 06:59:00
4,7482,17735.0,RN,RN,2000-06-01 06:59:00


In [7]:
#displaying this wide for review but we'll actually take this pivot and unpivot it again once the zeroes are in place.
discharge_df.reset_index().pivot(index ='snapshot_time', columns='UNIT', values = 'DISCHARGE_COUNT').fillna(0)

UNIT,CCU,CSRU,MICU,NICU,NWARD,SICU,TSICU
snapshot_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-06-01 06:59:00,0.0,0.0,5.0,0.0,0.0,0.0,0.0
2000-06-01 18:59:00,3.0,0.0,3.0,4.0,4.0,1.0,0.0
2000-06-02 06:59:00,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2000-06-02 18:59:00,3.0,0.0,8.0,4.0,1.0,2.0,1.0
2000-06-03 06:59:00,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2000-06-03 18:59:00,2.0,1.0,7.0,4.0,5.0,1.0,1.0
2000-06-04 06:59:00,0.0,0.0,2.0,1.0,0.0,1.0,0.0
2000-06-04 18:59:00,0.0,2.0,3.0,3.0,1.0,1.0,2.0
2000-06-05 18:59:00,1.0,0.0,5.0,6.0,2.0,1.0,0.0
2000-06-06 06:59:00,0.0,0.0,2.0,0.0,0.0,0.0,1.0


In [8]:
discharge_df

Unnamed: 0_level_0,Unnamed: 1_level_0,DISCHARGE_COUNT
snapshot_time,UNIT,Unnamed: 2_level_1
2000-06-01 06:59:00,MICU,5
2000-06-01 18:59:00,CCU,3
2000-06-01 18:59:00,MICU,3
2000-06-01 18:59:00,NICU,4
2000-06-01 18:59:00,NWARD,4
...,...,...
2000-06-30 18:59:00,MICU,5
2000-06-30 18:59:00,NICU,8
2000-06-30 18:59:00,NWARD,5
2000-06-30 18:59:00,SICU,1


In [9]:
unit_population = snap_df.groupby(['snapshot_time','UNIT'],dropna=False).nunique().rename(columns={'SUBJECT_ID':'PATIENT_COUNT'})
unit_population

Unnamed: 0_level_0,Unnamed: 1_level_0,PATIENT_COUNT
snapshot_time,UNIT,Unnamed: 2_level_1
2000-06-01 06:59:00,CCU,67
2000-06-01 06:59:00,CSRU,85
2000-06-01 06:59:00,MICU,225
2000-06-01 06:59:00,NICU,220
2000-06-01 06:59:00,NWARD,46
...,...,...
2000-06-30 18:59:00,NICU,227
2000-06-30 18:59:00,NWARD,36
2000-06-30 18:59:00,SICU,126
2000-06-30 18:59:00,TSICU,79


In [10]:
# again, pivoting this for display, but not bothering to fill zeros and melt out again.
unit_population.reset_index().pivot(index ='snapshot_time', columns='UNIT', values = 'PATIENT_COUNT').fillna(0)

UNIT,NaN,CCU,CSRU,MICU,NICU,NWARD,SICU,TSICU
snapshot_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000-06-01 06:59:00,816,67,85,225,220,46,129,84
2000-06-01 18:59:00,766,63,93,204,213,38,124,74
2000-06-02 06:59:00,802,68,91,197,220,43,133,75
2000-06-02 18:59:00,760,66,86,188,221,34,128,76
2000-06-03 06:59:00,803,65,86,193,220,39,126,76
2000-06-03 18:59:00,772,53,90,191,222,37,119,72
2000-06-04 06:59:00,813,59,88,211,224,40,125,73
2000-06-04 18:59:00,748,51,85,218,224,43,127,74
2000-06-05 06:59:00,791,53,83,234,229,50,140,77
2000-06-05 18:59:00,712,54,96,224,225,51,132,74


In [11]:
caregiver_population = snap_df.set_index(['SUBJECT_ID', 'snapshot_time'])\
    .join(care_df.set_index(['SUBJECT_ID', 'snapshot_time']))\
    .dropna(subset = ['CGID'], axis=0)\
    .reset_index()\
    .drop(columns=['SUBJECT_ID','LABEL', 'DESCRIPTION'])\
    .groupby(['snapshot_time','UNIT'], dropna = False)\
    .nunique().rename(columns={'CGID':'TEAM_COUNT'})
caregiver_population

Unnamed: 0_level_0,Unnamed: 1_level_0,TEAM_COUNT
snapshot_time,UNIT,Unnamed: 2_level_1
2000-06-01 06:59:00,CCU,47
2000-06-01 06:59:00,CSRU,66
2000-06-01 06:59:00,MICU,149
2000-06-01 06:59:00,NICU,123
2000-06-01 06:59:00,NWARD,13
...,...,...
2000-06-30 18:59:00,NICU,159
2000-06-30 18:59:00,NWARD,17
2000-06-30 18:59:00,SICU,82
2000-06-30 18:59:00,TSICU,57


In [12]:
# one more time, just showing this wide but will work with the data long.
caregiver_population.reset_index().pivot(index ='snapshot_time', columns='UNIT', values = 'TEAM_COUNT').fillna(0)

UNIT,NaN,CCU,CSRU,MICU,NICU,NWARD,SICU,TSICU
snapshot_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000-06-01 06:59:00,11,47,66,149,123,13,101,78
2000-06-01 18:59:00,53,41,63,165,158,12,90,57
2000-06-02 06:59:00,17,37,68,140,114,6,99,67
2000-06-02 18:59:00,28,39,58,156,152,7,95,56
2000-06-03 06:59:00,8,41,63,137,122,10,90,67
2000-06-03 18:59:00,29,40,65,147,164,17,78,64
2000-06-04 06:59:00,7,36,71,147,131,5,87,68
2000-06-04 18:59:00,23,43,72,169,150,15,81,49
2000-06-05 06:59:00,14,32,60,152,121,9,95,71
2000-06-05 18:59:00,57,44,69,179,159,13,92,47


In [13]:
unit_measures = pd.concat([unit_population, discharge_df, caregiver_population], axis = 1).fillna(0)
unit_measures['nurse_discharge_ratio'] = unit_measures.DISCHARGE_COUNT / unit_measures.PATIENT_COUNT
unit_measures

Unnamed: 0_level_0,Unnamed: 1_level_0,PATIENT_COUNT,DISCHARGE_COUNT,TEAM_COUNT,nurse_discharge_ratio
snapshot_time,UNIT,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-06-01 06:59:00,CCU,67,0.0,47,0.000000
2000-06-01 06:59:00,CSRU,85,0.0,66,0.000000
2000-06-01 06:59:00,MICU,225,5.0,149,0.022222
2000-06-01 06:59:00,NICU,220,0.0,123,0.000000
2000-06-01 06:59:00,NWARD,46,0.0,13,0.000000
...,...,...,...,...,...
2000-06-28 18:59:00,,758,0.0,58,0.000000
2000-06-29 06:59:00,,809,0.0,12,0.000000
2000-06-29 18:59:00,,764,0.0,28,0.000000
2000-06-30 06:59:00,,785,0.0,15,0.000000


## Building out the cohort's test result data

In [14]:
patient_cohort = pd.read_sql_query("""
        SELECT DISTINCT pts.SUBJECT_ID
        FROM (SELECT DISTINCT SUBJECT_ID, HADM_ID
              FROM TRANSFERS T
              WHERE T.EVENTTYPE = 'admit'
                AND INTIME BETWEEN '2000-06-01' AND '2000-07-01') pts
         INNER JOIN
         -- TODO Validate Lab Codes
             (SELECT *
              FROM LABS
                       INNER JOIN ICD_LABS ON LABS.ITEMID = ICD_LABS.ITEMID AND lower(ICD_LABS.label) like 'tropo%') tropo
             ON pts.SUBJECT_ID = tropo.SUBJECT_ID
                 AND pts.HADM_ID = tropo.HADM_ID
        INNER JOIN
        -- TODO Validate Diagnoses
            (SELECT * from DIAGNOSES
             WHERE (SUBSTR(DIAGNOSES.ICD9_CODE, 1, 3) in
            ('390', '391', '392', --acute rheumatic
             '393', '394', '395', '396', '397', '398', --chronic rheumatic
             '410', '411', '412', '413', '414', -- ischemic hd
             '415', '416', '417', -- pulmonary circulation
             '420', '421', '422', '423', '424', '425', '426', '427', '428', '429'))-- other hd
                ) dx
            ON pts.SUBJECT_ID = dx.SUBJECT_ID AND pts.HADM_ID = dx.HADM_ID
    """, con = con)

In [15]:
total_patients = con.execute("SELECT COUNT(DISTINCT SUBJECT_ID) from PATIENTS").fetchone()[0]
print(f'''Total Number of Patients in Dataset: {total_patients}. Patient cohort size: {len(patient_cohort)}
Cohort represents {len(patient_cohort)/total_patients:.1%} of all patients in dataset.''')

Total Number of Patients in Dataset: 46488. Patient cohort size: 1433
Cohort represents 3.1% of all patients in dataset.


In [16]:
def collect_latest_measure(lab_df, pt, threshold_date):
    lab_df = lab_df.loc[lab_df['SUBJECT_ID']==pt]
    if len(lab_df) == 0:
        return None
    else:
        try:
            latest_index = lab_df.loc[lab_df['CHARTTIME'] < threshold_date]['CHARTTIME'].idxmax()
            return dict(lab_df.loc[latest_index])
        except ValueError:
            return None


In [23]:
cohort_labs = []
tests = {'tropinin': ['6598-7', '10839-9'],
         'BNP': ['33762-6', '33762-6']} # hackery, I know. makes it possible to swap in 'tuple(foo)' in the query

for test in tests:
    patient_labs = pd.read_sql_query(f"""SELECT LABS.SUBJECT_ID, CHARTTIME, VALUENUM, VALUEUOM, LABEL FROM LABS
                       INNER JOIN ICD_LABS ON LABS.ITEMID = ICD_LABS.ITEMID AND ICD_LABS.LOINC_CODE in {tuple(tests[test])}
                       WHERE SUBJECT_ID IN {tuple(patient_cohort['SUBJECT_ID'])}""", con = con, parse_dates=['CHARTTIME'])
    patient_labs['test_family'] = test
    for patient in patient_cohort['SUBJECT_ID']:
        for snapdate in scaffold:
            latest_test = collect_latest_measure(patient_labs, patient, snapdate)
            if latest_test is None:
                latest_test = {'SUBJECT_ID':patient,
                              'CHARTTIME': None,
                              'VALUENUM': None,
                              'VALUEUOM': None,
                              'LABEL': None,
                              'test_family': test}
            latest_test['snapshot_time'] = snapdate
            cohort_labs.append(latest_test)

In [24]:
cohort_labs = pd.DataFrame(cohort_labs).set_index(['snapshot_time', 'SUBJECT_ID'])

In [25]:
cohort_labs.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,CHARTTIME,VALUENUM,VALUEUOM,LABEL,test_family
snapshot_time,SUBJECT_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-06-01 06:59:00,112,NaT,,,,tropinin
2000-06-01 18:59:00,112,NaT,,,,tropinin
2000-06-02 06:59:00,112,NaT,,,,tropinin
2000-06-02 18:59:00,112,NaT,,,,tropinin
2000-06-03 06:59:00,112,NaT,,,,tropinin


In [30]:
snap_cohort_df = snap_df[snap_df.SUBJECT_ID.isin(patient_cohort['SUBJECT_ID'])].set_index(['snapshot_time','UNIT'])
snap_cohort_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,SUBJECT_ID
snapshot_time,UNIT,Unnamed: 2_level_1
2000-06-01 06:59:00,,124
2000-06-01 06:59:00,MICU,2828
2000-06-01 06:59:00,,4159
2000-06-01 06:59:00,,3866
2000-06-01 06:59:00,CSRU,3868


In [35]:
pd.concat([snap_cohort_df, unit_measures], axis=1)

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [39]:
snap_cohort_df.index

MultiIndex([('2000-06-01 06:59:00',    nan),
            ('2000-06-01 06:59:00', 'MICU'),
            ('2000-06-01 06:59:00',    nan),
            ('2000-06-01 06:59:00',    nan),
            ('2000-06-01 06:59:00', 'CSRU'),
            ('2000-06-01 06:59:00',  'CCU'),
            ('2000-06-01 06:59:00',    nan),
            ('2000-06-01 06:59:00', 'MICU'),
            ('2000-06-01 06:59:00',    nan),
            ('2000-06-01 06:59:00', 'MICU'),
            ...
            ('2000-06-30 18:59:00',  'CCU'),
            ('2000-06-30 18:59:00', 'SICU'),
            ('2000-06-30 18:59:00',    nan),
            ('2000-06-30 18:59:00', 'CSRU'),
            ('2000-06-30 18:59:00',    nan),
            ('2000-06-30 18:59:00', 'CSRU'),
            ('2000-06-30 18:59:00',    nan),
            ('2000-06-30 18:59:00',    nan),
            ('2000-06-30 18:59:00', 'MICU'),
            ('2000-06-30 18:59:00',  'CCU')],
           names=['snapshot_time', 'UNIT'], length=23871)