# Getting my labelled data

#### Script to find:

1) all hospital admissions in which the patient was invasively ventilated in the ICU 

2) in which of these admissions were patients placed back on the ventilator within 48 hours

3) remove patients that died while on the ventilator

4) look for > 2 'invasive ventilation' events in any particular hospital admission

#### import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### Read data from 'procedureevents.csv', which lists the procedure events (including invasive ventilation) recorded for patients in the ICU, and from 'admissions.csv', the hospital admissions table

In [2]:
# Load the two raw tables used in this notebook:
#   df     - the procedure events table
#   admits - the hospital admissions table (merged in later on hadm_id)
df, admits = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/raw/procedureevents.csv', '../data/raw/admissions.csv')
)

#### find all 'invasive ventilation' events (identified by itemid code: 225792)

In [3]:
# Keep only the rows whose itemid is 225792 (the 'invasive ventilation' code),
# then report how many distinct hospital admissions those rows cover.
is_invasive_vent = df['itemid'].eq(225792)
vdf = df.loc[is_invasive_vent]
vdf['hadm_id'].nunique()

26409

In [4]:
# Ventilation events beyond the first one per admission:
# non-null hadm_id entries minus the number of distinct hadm_ids.
vent_hadm_ids = vdf['hadm_id']
vent_hadm_ids.count() - vent_hadm_ids.nunique()

4223

From the two results above, there were 30632 ventilation events in the study, from 26409 different hospital admissions. The difference, 4223, is the number of repeat ventilation events, so some admissions involved more than one ventilation (an admission with three events contributes two to this count).

#### Get list of admissions where the patient was ventilated only once

In [5]:
# Number of ventilation events recorded for each admission (indexed by hadm_id).
num_ventilations = vdf['hadm_id'].value_counts()

# Admissions with exactly one ventilation event, and the matching event rows.
single_vent_ids = num_ventilations[num_ventilations == 1].index
svdf = vdf.loc[vdf['hadm_id'].isin(single_vent_ids)]
svdf

Unnamed: 0,subject_id,hadm_id,stay_id,starttime,endtime,storetime,itemid,value,valueuom,location,...,patientweight,totalamount,totalamountuom,isopenbag,continueinnextdept,cancelreason,statusdescription,comments_date,originalamount,originalrate
25,13859862,25015072,31891587,2152-04-07 01:50:00,2152-04-10 18:57:00,2152-04-10 19:07:00,225792,89.116667,hour,,...,58.0,,,1,0,0,FinishedRunning,,89.1167,1
78,18917458,28038802,31692394,2185-12-15 21:11:00,2185-12-20 09:59:00,2185-12-21 08:31:00,225792,108.800000,hour,,...,104.0,,,1,0,0,FinishedRunning,,108.8000,1
96,19704964,21790335,30998091,2140-03-10 04:44:00,2140-03-11 13:44:00,2140-03-11 13:44:47.817,225792,33.000000,hour,,...,106.0,,,1,0,0,FinishedRunning,,33.0000,1
105,19004463,24357541,37805052,2177-01-14 02:35:00,2177-02-10 14:00:00,2177-02-11 13:33:00,225792,659.416667,hour,,...,76.1,,,1,0,0,FinishedRunning,,659.4170,1
212,19350792,23255460,34502665,2172-11-17 23:53:00,2172-11-20 08:06:00,2172-11-20 08:06:46.433,225792,56.216667,hour,,...,63.0,,,1,0,0,FinishedRunning,,56.2167,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
689692,18667869,29258317,38880976,2126-07-24 23:01:00,2126-07-25 17:00:00,2126-07-25 17:04:00,225792,1079.000000,min,,...,83.5,,,1,0,0,FinishedRunning,,1079.0000,1
689726,14798663,27815009,39028934,2113-10-25 11:35:00,2113-10-26 10:11:00,2113-10-26 10:12:00,225792,1356.000000,min,,...,85.8,,,1,0,0,FinishedRunning,,1356.0000,1
689734,18616832,29021730,30844212,2196-04-16 12:50:00,2196-04-16 15:45:00,2196-04-16 15:53:00,225792,175.000000,min,,...,90.6,,,1,0,0,FinishedRunning,,175.0000,1
689748,17304212,27385559,38342744,2154-06-29 18:29:00,2154-06-29 21:52:00,2154-06-29 22:16:00,225792,203.000000,min,,...,65.0,,,1,0,0,FinishedRunning,,203.0000,1


In [6]:
# Label every single-ventilation admission as class 0.
# Work on a copy so the rows selected from vdf are not modified.
svdf = svdf.copy()
svdf['re_intub_class'] = 0
svdf

Unnamed: 0,subject_id,hadm_id,stay_id,starttime,endtime,storetime,itemid,value,valueuom,location,...,totalamount,totalamountuom,isopenbag,continueinnextdept,cancelreason,statusdescription,comments_date,originalamount,originalrate,re_intub_class
25,13859862,25015072,31891587,2152-04-07 01:50:00,2152-04-10 18:57:00,2152-04-10 19:07:00,225792,89.116667,hour,,...,,,1,0,0,FinishedRunning,,89.1167,1,0
78,18917458,28038802,31692394,2185-12-15 21:11:00,2185-12-20 09:59:00,2185-12-21 08:31:00,225792,108.800000,hour,,...,,,1,0,0,FinishedRunning,,108.8000,1,0
96,19704964,21790335,30998091,2140-03-10 04:44:00,2140-03-11 13:44:00,2140-03-11 13:44:47.817,225792,33.000000,hour,,...,,,1,0,0,FinishedRunning,,33.0000,1,0
105,19004463,24357541,37805052,2177-01-14 02:35:00,2177-02-10 14:00:00,2177-02-11 13:33:00,225792,659.416667,hour,,...,,,1,0,0,FinishedRunning,,659.4170,1,0
212,19350792,23255460,34502665,2172-11-17 23:53:00,2172-11-20 08:06:00,2172-11-20 08:06:46.433,225792,56.216667,hour,,...,,,1,0,0,FinishedRunning,,56.2167,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
689692,18667869,29258317,38880976,2126-07-24 23:01:00,2126-07-25 17:00:00,2126-07-25 17:04:00,225792,1079.000000,min,,...,,,1,0,0,FinishedRunning,,1079.0000,1,0
689726,14798663,27815009,39028934,2113-10-25 11:35:00,2113-10-26 10:11:00,2113-10-26 10:12:00,225792,1356.000000,min,,...,,,1,0,0,FinishedRunning,,1356.0000,1,0
689734,18616832,29021730,30844212,2196-04-16 12:50:00,2196-04-16 15:45:00,2196-04-16 15:53:00,225792,175.000000,min,,...,,,1,0,0,FinishedRunning,,175.0000,1,0
689748,17304212,27385559,38342744,2154-06-29 18:29:00,2154-06-29 21:52:00,2154-06-29 22:16:00,225792,203.000000,min,,...,,,1,0,0,FinishedRunning,,203.0000,1,0


#### Get list of admissions where the patient was ventilated exactly twice, with the second ventilation starting within 48 hours of the first one ending

In [7]:
# Event rows for admissions that have exactly two ventilation events.
has_two_vents = vdf['hadm_id'].isin(num_ventilations[num_ventilations == 2].index)
mvdf = vdf.loc[has_two_vents]

# Order by admission, then by start time, so each admission's two rows are
# adjacent with the earlier event first; then parse the timestamps so they
# can be subtracted below.
sorted_mvdf = mvdf.sort_values(by=['hadm_id', 'starttime']).assign(
    starttime=lambda events: pd.to_datetime(events['starttime']),
    endtime=lambda events: pd.to_datetime(events['endtime']),
)

# Rows alternate first/second event per admission: even positions hold the
# first ventilation, odd positions hold the second.
first_vent = sorted_mvdf.iloc[0::2]
second_vent = sorted_mvdf.iloc[1::2]

# One row per admission; *_x columns come from the first event and *_y
# columns from the second.
vent_diff = first_vent.merge(second_vent, on='hadm_id')

# Hours between the end of the first ventilation and the start of the second.
vent_diff['timetosecondvent'] = (
    (vent_diff['starttime_y'] - vent_diff['endtime_x']) / np.timedelta64(1, 'h')
)

# Admissions whose second ventilation started less than 48 hours after the first ended.
re_vent = vent_diff.loc[vent_diff['timetosecondvent'].lt(48)]

In [8]:
# Strip the merge suffix from the FIRST ventilation's end time, duration and
# duration unit so these columns line up with the same-named columns of the
# single-ventilation table when the two are concatenated.
# An explicit rename mapping is used instead of overwriting every column label
# by position: it does not depend on the exact number or order of columns in
# the raw CSV, and re-running the cell is a no-op.
re_vent = re_vent.rename(columns={
    'endtime_x': 'endtime',
    'value_x': 'value',
    'valueuom_x': 'valueuom',
})

In [9]:
# Label every admission re-ventilated within 48 hours as class 1.
# Work on a copy so the rows selected from vent_diff are not modified.
re_vent = re_vent.copy()
re_vent['re_intub_class'] = 1
re_vent

Unnamed: 0,subject_id_x,hadm_id,stay_id_x,starttime_x,endtime,storetime_x,itemid_x,value,valueuom,location_x,...,totalamountuom_y,isopenbag_y,continueinnextdept_y,cancelreason_y,statusdescription_y,comments_date_y,originalamount_y,originalrate_y,timetosecondvent,re_intub_class
2,17223646,20023461,36478017,2152-11-03 11:10:00,2152-11-03 16:44:00,2152-11-03 16:45:00,225792,334.0,min,,...,,1,0,0,FinishedRunning,,6831.0,1,0.766667,1
3,16779215,20024229,38542512,2132-08-27 16:00:00,2132-08-28 17:30:00,2132-08-28 17:43:00,225792,1530.0,min,,...,,1,0,0,FinishedRunning,,5554.0,1,34.716667,1
4,13605251,20025172,38493547,2139-12-31 14:02:00,2140-01-04 11:09:00,2140-01-04 11:09:00,225792,5587.0,min,,...,,1,0,0,FinishedRunning,,23548.0,1,21.133333,1
5,16976998,20034762,30778603,2164-06-07 08:10:00,2164-06-08 14:00:00,2164-06-08 17:51:00,225792,1790.0,min,,...,,1,0,0,FinishedRunning,,1901.0,1,45.183333,1
6,15878712,20035700,30421802,2123-02-01 10:22:00,2123-02-05 14:52:00,2123-02-05 18:01:00,225792,6030.0,min,,...,,1,0,0,FinishedRunning,,2014.0,1,34.633333,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2337,16691924,29960248,32296652,2146-12-06 12:55:00,2146-12-09 14:23:00,2146-12-09 14:25:00,225792,4408.0,min,,...,,1,0,0,FinishedRunning,,3136.0,1,10.283333,1
2338,16796371,29962016,38636381,2135-10-21 09:40:00,2135-10-30 11:45:00,2135-10-30 12:27:00,225792,13085.0,min,,...,,1,0,0,FinishedRunning,,2842.0,1,0.583333,1
2341,10020944,29974575,33972149,2131-02-27 17:00:00,2131-03-03 19:54:00,2131-03-04 09:55:00,225792,5934.0,min,,...,,1,0,0,FinishedRunning,,5446.0,1,0.283333,1
2343,19652570,29987115,35887309,2148-02-17 14:00:00,2148-02-19 10:00:00,2148-02-19 10:02:00,225792,2640.0,min,,...,,1,0,0,FinishedRunning,,395.0,1,26.350000,1


In [10]:
# Stack the class-0 (single ventilation) and class-1 (re-ventilated) rows,
# keeping only the columns the two tables share for labelling.
label_cols = ['hadm_id', 'endtime', 'value', 'valueuom', 're_intub_class']
sample_vents = pd.concat([svdf[label_cols], re_vent[label_cols]])

In [11]:
# Preview the combined labelled sample (class 0 rows followed by class 1 rows).
sample_vents

Unnamed: 0,hadm_id,endtime,value,valueuom,re_intub_class
25,25015072,2152-04-10 18:57:00,89.116667,hour,0
78,28038802,2185-12-20 09:59:00,108.800000,hour,0
96,21790335,2140-03-11 13:44:00,33.000000,hour,0
105,24357541,2177-02-10 14:00:00,659.416667,hour,0
212,23255460,2172-11-20 08:06:00,56.216667,hour,0
...,...,...,...,...,...
2337,29960248,2146-12-09 14:23:00,4408.000000,min,1
2338,29962016,2135-10-30 11:45:00,13085.000000,min,1
2341,29974575,2131-03-03 19:54:00,5934.000000,min,1
2343,29987115,2148-02-19 10:00:00,2640.000000,min,1


#### Join sample_vents and admissions table together

In [12]:
# Attach the admission record to each labelled ventilation row.
# The join key has the same name on both sides; the default join type (inner) is used.
merged_inner = sample_vents.merge(admits, on='hadm_id')
merged_inner

Unnamed: 0,hadm_id,endtime,value,valueuom,re_intub_class,subject_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,marital_status,ethnicity,edregtime,edouttime,hospital_expire_flag
0,25015072,2152-04-10 18:57:00,89.116667,hour,0,13859862,2152-03-27 12:44:00,2152-04-11 03:00:00,2152-04-11 03:00:00,EW EMER.,EMERGENCY ROOM,DIED,Medicare,ENGLISH,WIDOWED,WHITE,2152-03-27 09:46:00,2152-03-27 14:17:00,1
1,28038802,2185-12-20 09:59:00,108.800000,hour,0,18917458,2185-12-15 00:17:00,2185-12-22 17:15:00,,EW EMER.,EMERGENCY ROOM,HOSPICE,Medicaid,ENGLISH,MARRIED,WHITE,2185-12-14 23:05:00,2185-12-15 01:31:00,0
2,21790335,2140-03-11 13:44:00,33.000000,hour,0,19704964,2140-03-09 14:44:00,2140-03-15 13:00:00,,URGENT,PHYSICIAN REFERRAL,HOME,Other,?,MARRIED,UNABLE TO OBTAIN,,,0
3,24357541,2177-02-10 14:00:00,659.416667,hour,0,19004463,2176-12-16 18:11:00,2177-02-15 17:00:00,,URGENT,TRANSFER FROM HOSPITAL,REHAB,Medicare,ENGLISH,MARRIED,WHITE,,,0
4,23255460,2172-11-20 08:06:00,56.216667,hour,0,19350792,2172-11-17 17:21:00,2172-11-20 06:00:00,2172-11-20 06:00:00,EW EMER.,EMERGENCY ROOM,DIED,Medicare,?,SINGLE,ASIAN,2172-11-17 15:05:00,2172-11-17 18:39:00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24674,29960248,2146-12-09 14:23:00,4408.000000,min,1,16691924,2146-11-30 16:09:00,2146-12-17 17:04:00,,DIRECT EMER.,CLINIC REFERRAL,SKILLED NURSING FACILITY,Medicare,ENGLISH,WIDOWED,WHITE,,,0
24675,29962016,2135-10-30 11:45:00,13085.000000,min,1,16796371,2135-10-21 08:16:00,2135-11-04 13:30:00,,EW EMER.,EMERGENCY ROOM,REHAB,Medicare,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,2135-10-21 05:47:00,2135-10-21 09:45:00,0
24676,29974575,2131-03-03 19:54:00,5934.000000,min,1,10020944,2131-02-27 15:34:00,2131-03-13 17:01:00,,EW EMER.,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,ENGLISH,,UNKNOWN,2131-02-27 13:16:00,2131-02-27 16:40:00,0
24677,29987115,2148-02-19 10:00:00,2640.000000,min,1,19652570,2148-02-16 17:42:00,2148-02-20 19:00:00,2148-02-20 19:00:00,URGENT,TRANSFER FROM HOSPITAL,DIED,Other,ENGLISH,,UNKNOWN,,,1


#### Find patients that died within 24 hours (before or after) of the recorded end of ventilation

In [13]:
# Parse the two timestamps, then derive the gap between the end of ventilation
# and death, both as a timedelta and in hours.
# Sign convention: the gap is endtime - deathtime, so it is NEGATIVE when the
# patient died after ventilation ended and NaT/NaN when there is no deathtime.
merged_inner = merged_inner.assign(
    endtime=lambda adm: pd.to_datetime(adm['endtime']),
    deathtime=lambda adm: pd.to_datetime(adm['deathtime']),
    time_to_death=lambda adm: adm['endtime'] - adm['deathtime'],
    hours_to_death=lambda adm: adm['time_to_death'] / np.timedelta64(1, 'h'),
)

In [14]:
# Rows with a computed hours_to_death, i.e. admissions that have a deathtime.
dead_patients = merged_inner.dropna(subset=['hours_to_death'])

In [15]:
# Deaths strictly within 24 hours either side of the recorded end of ventilation.
died_near_vent_end = dead_patients['hours_to_death'].abs().lt(24)
died_on_vent = dead_patients.loc[died_near_vent_end]

In [16]:
# Exclude every admission flagged in died_on_vent from the labelled sample.
dead_ids = died_on_vent['hadm_id'].tolist()
is_excluded = merged_inner['hadm_id'].isin(dead_ids)
not_dead_vents = merged_inner.loc[~is_excluded]

#### make sure time on vent is all in hours and only use data from when patients were on the vent for more than 6 hours

In [17]:
# Independent (deep) copy, so later edits do not touch not_dead_vents.
svents_keep = not_dead_vents.copy(deep=True)

In [18]:
# Express every ventilation duration in hours and tidy the columns.
# The table is rebuilt from not_dead_vents in a single chain rather than being
# mutated in place, so the cell can be re-run safely: the in-place version
# raised KeyError on a second run because 'valueuom' had already been dropped.
svents_keep = (
    not_dead_vents
    .assign(
        # minutes -> hours, days -> hours; any other unit is left unchanged
        value=lambda vents: np.select(
            [vents['valueuom'] == 'min', vents['valueuom'] == 'day'],
            [vents['value'] / 60, vents['value'] * 24],
            default=vents['value'],
        )
    )
    # the unit is now always hours, and hours_to_death supersedes the timedelta
    .drop(columns=['valueuom', 'time_to_death'])
    .rename(columns={'value': 'time_on_vent'})
)

In [19]:
# Preview the table after unit conversion (durations are in the time_on_vent column).
svents_keep

Unnamed: 0,hadm_id,endtime,time_on_vent,re_intub_class,subject_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,marital_status,ethnicity,edregtime,edouttime,hospital_expire_flag,hours_to_death
1,28038802,2185-12-20 09:59:00,108.800000,0,18917458,2185-12-15 00:17:00,2185-12-22 17:15:00,NaT,EW EMER.,EMERGENCY ROOM,HOSPICE,Medicaid,ENGLISH,MARRIED,WHITE,2185-12-14 23:05:00,2185-12-15 01:31:00,0,
2,21790335,2140-03-11 13:44:00,33.000000,0,19704964,2140-03-09 14:44:00,2140-03-15 13:00:00,NaT,URGENT,PHYSICIAN REFERRAL,HOME,Other,?,MARRIED,UNABLE TO OBTAIN,,,0,
3,24357541,2177-02-10 14:00:00,659.416667,0,19004463,2176-12-16 18:11:00,2177-02-15 17:00:00,NaT,URGENT,TRANSFER FROM HOSPITAL,REHAB,Medicare,ENGLISH,MARRIED,WHITE,,,0,
5,22188993,2145-11-04 18:40:00,143.666667,0,11538389,2145-10-29 12:46:00,2145-11-09 17:30:00,2145-11-09 17:30:00,EW EMER.,EMERGENCY ROOM,DIED,Medicare,ENGLISH,MARRIED,UNKNOWN,2145-10-29 11:07:00,2145-10-29 14:00:00,1,-118.833333
7,21880799,2134-05-22 17:58:00,54.966667,0,16606203,2134-05-20 09:51:00,2134-05-22 18:00:00,NaT,EW EMER.,EMERGENCY ROOM,CHRONIC/LONG TERM ACUTE CARE,Medicare,?,MARRIED,HISPANIC/LATINO,2134-05-20 06:11:00,2134-05-20 11:15:00,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24674,29960248,2146-12-09 14:23:00,73.466667,1,16691924,2146-11-30 16:09:00,2146-12-17 17:04:00,NaT,DIRECT EMER.,CLINIC REFERRAL,SKILLED NURSING FACILITY,Medicare,ENGLISH,WIDOWED,WHITE,,,0,
24675,29962016,2135-10-30 11:45:00,218.083333,1,16796371,2135-10-21 08:16:00,2135-11-04 13:30:00,NaT,EW EMER.,EMERGENCY ROOM,REHAB,Medicare,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,2135-10-21 05:47:00,2135-10-21 09:45:00,0,
24676,29974575,2131-03-03 19:54:00,98.900000,1,10020944,2131-02-27 15:34:00,2131-03-13 17:01:00,NaT,EW EMER.,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,ENGLISH,,UNKNOWN,2131-02-27 13:16:00,2131-02-27 16:40:00,0,
24677,29987115,2148-02-19 10:00:00,44.000000,1,19652570,2148-02-16 17:42:00,2148-02-20 19:00:00,2148-02-20 19:00:00,URGENT,TRANSFER FROM HOSPITAL,DIED,Other,ENGLISH,,UNKNOWN,,,1,-33.000000


In [20]:
# Keep only ventilation episodes that lasted more than 6 hours.
svents_keep = svents_keep.loc[svents_keep['time_on_vent'].gt(6)]

In [21]:
# List the columns of the final labelled table.
svents_keep.columns

Index(['hadm_id', 'endtime', 'time_on_vent', 're_intub_class', 'subject_id',
       'admittime', 'dischtime', 'deathtime', 'admission_type',
       'admission_location', 'discharge_location', 'insurance', 'language',
       'marital_status', 'ethnicity', 'edregtime', 'edouttime',
       'hospital_expire_flag', 'hours_to_death'],
      dtype='object')

#### save to csv

In [22]:
# The export of the final labelled table is disabled; uncomment the line below to write the CSV.
#svents_keep.to_csv('../data/processed/sample_vents.csv',index= False)