# Getting my labelled data

#### Script to find:

1) all hospital admissions in which the patient was invasively ventilated in the ICU 

2) in which of these admissions were patients placed back on the ventilator within 48 hours

3) remove patients that died while on the ventilator

4) look for > 2 'invasive ventilation' events in any particular hospital admission

#### import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### Read data from the file 'procedureevents.csv', which lists all invasive ventilation procedures occurring in patients in the ICU

In [2]:
df = pd.read_csv('procedureevents.csv')
#df.info()

#### find all 'invasive ventilation' events (identified by itemid code: 225792)

In [3]:
# itemid 225792 identifies 'invasive ventilation' events; keep only those rows.
vdf = df.loc[df['itemid'].eq(225792)]
# Number of distinct hospital admissions that have at least one such event.
vdf['hadm_id'].nunique()

26409

In [4]:
# Ventilation events beyond the first one in each admission:
# (non-null hadm_id rows) minus (distinct hadm_id values).
vdf['hadm_id'].pipe(lambda ids: ids.count() - ids.nunique())

4223

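The counts above show that there were 30632 ventilation events in the study, from 26409 different hospital admissions. The difference, 4223, is the number of repeat ventilation events (events beyond the first within an admission). Because a single admission can contain more than two events, the number of admissions in which the patient was ventilated multiple times is at most 4223.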

#### Get list of admissions where the patient was ventilated only once

In [5]:
# Number of ventilation events recorded for each admission, indexed by hadm_id.
num_ventilations = vdf['hadm_id'].value_counts()
# Rows belonging to admissions that appear exactly once (a single ventilation event).
svdf = vdf.loc[vdf['hadm_id'].isin(num_ventilations[num_ventilations.eq(1)].index)]

#### Get list of admissions where the patient was ventilated a second time within 48 hours

In [6]:
# Rows belonging to admissions with exactly two ventilation events.
mvdf = vdf.loc[vdf['hadm_id'].isin(num_ventilations[num_ventilations.eq(2)].index)]

# Order by admission and then by start time, and parse both timestamps as datetimes.
sorted_mvdf = mvdf.sort_values(by=['hadm_id', 'starttime']).assign(
    starttime=lambda frame: pd.to_datetime(frame['starttime']),
    endtime=lambda frame: pd.to_datetime(frame['endtime']),
)

# Every admission here contributes exactly two consecutive rows, so even
# positions hold the first event and odd positions hold the second.
first_vent = sorted_mvdf.iloc[0::2]
second_vent = sorted_mvdf.iloc[1::2]

# One row per admission, so the start/end times of both ventilation events sit
# in the same row: "_x" columns come from the first event, "_y" from the second.
vent_diff = first_vent.merge(second_vent, on='hadm_id')

# Gap, in hours, between the end of the first event and the start of the second.
vent_diff['timetosecondvent'] = (
    vent_diff['starttime_y'] - vent_diff['endtime_x']
) / np.timedelta64(1, 'h')

# Admissions whose second event began less than 48 hours after the first ended.
re_vent = vent_diff.loc[vent_diff['timetosecondvent'].lt(48)]

In [18]:
# Drop the merge suffix from the three first-event columns that are later
# combined with the single-ventilation table ('endtime', 'value', 'valueuom').
# Renaming by name, rather than assigning a hard-coded list of all 52 labels by
# position, leaves every other column untouched and cannot silently mislabel
# data (or fail on a length mismatch) if the input file's column order or
# column count differs.
re_vent = re_vent.rename(columns={
    'endtime_x': 'endtime',
    'value_x': 'value',
    'valueuom_x': 'valueuom',
})

In [38]:
# Stack the single-ventilation admissions on top of the re-ventilated ones,
# keeping the same four columns from each table.
sample_vents = pd.concat(
    [frame[['hadm_id', 'endtime', 'value', 'valueuom']] for frame in (svdf, re_vent)]
)

In [43]:
# Save the combined table to disk without the row index.
sample_vents.to_csv(path_or_buf='sample_vents.csv', index=False)

In [44]:
# Load the saved sample table back from disk.
svents = pd.read_csv(filepath_or_buffer='sample_vents.csv')

In [45]:
# Display the table loaded from 'sample_vents.csv' (columns: hadm_id, endtime, value, valueuom).
svents

Unnamed: 0,hadm_id,endtime,value,valueuom
0,25015072,2152-04-10 18:57:00,89.116667,hour
1,28038802,2185-12-20 09:59:00,108.800000,hour
2,21790335,2140-03-11 13:44:00,33.000000,hour
3,24357541,2177-02-10 14:00:00,659.416667,hour
4,23255460,2172-11-20 08:06:00,56.216667,hour
...,...,...,...,...
24674,29960248,2146-12-09 14:23:00,4408.000000,min
24675,29962016,2135-10-30 11:45:00,13085.000000,min
24676,29974575,2131-03-03 19:54:00,5934.000000,min
24677,29987115,2148-02-19 10:00:00,2640.000000,min


In [7]:
import psutil

In [46]:
# Snapshot of system memory statistics; the amount currently available is
# printed in bytes.
svmem = psutil.virtual_memory()
print(svmem.available)

5237514240


In [47]:
import os

In [48]:
# Size of the chart-events file on disk, in bytes.
os.stat('./chartevents.csv').st_size

29184776616

In [49]:
# Read only the first 10 rows to estimate how much memory the data needs.
df_sample = pd.read_csv(filepath_or_buffer='./chartevents.csv', nrows=10)
# Total bytes reported for the sample (index included, which is the default).
# The deep size of string columns is not measured here.
df_sample_size = df_sample.memory_usage().sum()

In [50]:
# Rows per chunk: a 2,000,000,000-byte budget divided by the sample's byte
# size, divided by 10 again, with the fractional part dropped.
# NOTE(review): the sample holds 10 rows, so bytes-per-row is
# df_sample_size / 10; dividing (rather than multiplying) by 10 here gives a
# chunk roughly 100x smaller than the budget. That is a safe, conservative
# value, but confirm it is the intended one.
my_chunk = int((2_000_000_000 / df_sample_size / 10) // 1)
print(my_chunk)

215517


In [51]:
# Open the chart-events file as a chunked reader (my_chunk rows at a time)
# instead of loading it whole, with an explicit dtype for every column.
iter_csv = pd.read_csv(
    'chartevents.csv',
    iterator=True,
    chunksize=my_chunk,
    dtype=dict(
        subject_id=int,
        hadm_id=int,
        stay_id=int,
        charttime=str,
        storetime=str,
        itemid=int,
        value=str,
        valuenum=float,
        valueuom=str,
        warning=int,
    ),
)

In [52]:
# Walk through the file chunk by chunk, keep only rows whose itemid is one of
# the three codes of interest, and stitch the survivors into one frame.
# Note: this consumes the reader, so iter_csv must be recreated before re-running.
df_result = pd.concat(
    [part[part['itemid'].isin([220052, 225312, 220181])] for part in iter_csv]
)

In [58]:
# Admission ids present in the sample table.
ids = svents.loc[:, 'hadm_id']
# Restrict the chart rows to those admissions.
df_result = df_result.loc[df_result['hadm_id'].isin(ids)]

In [59]:
# Order the chart rows by admission and then by chart time (both ascending).
sorted_chartdf = df_result.sort_values(by=['hadm_id', 'charttime'])

In [61]:
# Spot-check: show the sorted chart rows for a single admission.
sorted_chartdf.loc[sorted_chartdf['hadm_id'].eq(20000147)]

Unnamed: 0,subject_id,hadm_id,stay_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
7058535,14990224,20000147,37348463,2121-08-30 22:09:00,2121-08-30 22:12:00,220052,91,91.0,mmHg,0
7058543,14990224,20000147,37348463,2121-08-30 22:15:00,2121-08-30 22:33:00,220052,94,94.0,mmHg,0
7058573,14990224,20000147,37348463,2121-08-30 22:30:00,2121-08-30 22:33:00,220052,77,77.0,mmHg,0
7058590,14990224,20000147,37348463,2121-08-30 22:45:00,2121-08-30 23:10:00,220052,75,75.0,mmHg,0
7058596,14990224,20000147,37348463,2121-08-30 23:00:00,2121-08-30 23:10:00,220052,72,72.0,mmHg,0
7058627,14990224,20000147,37348463,2121-08-31 00:00:00,2121-08-31 00:25:00,220052,82,82.0,mmHg,0
7058657,14990224,20000147,37348463,2121-08-31 01:00:00,2121-08-31 01:06:00,220052,84,84.0,mmHg,0
7058672,14990224,20000147,37348463,2121-08-31 02:00:00,2121-08-31 02:15:00,220052,83,83.0,mmHg,0
7058698,14990224,20000147,37348463,2121-08-31 03:00:00,2121-08-31 03:04:00,220052,68,68.0,mmHg,0
7058708,14990224,20000147,37348463,2121-08-31 04:00:00,2121-08-31 04:06:00,220052,83,83.0,mmHg,0


In [62]:
# Save the sorted chart rows to disk without the row index.
sorted_chartdf.to_csv(path_or_buf='blood_pressure.csv', index=False)

In [None]:
import pandas as pd
# NOTE(review): no visible cell writes 'df_result.csv' (the filtered chart data
# is saved as 'blood_pressure.csv'); confirm this file exists before running.
# This also rebinds `df`, which earlier held the procedure events table.
df = pd.read_csv(filepath_or_buffer='df_result.csv')

In [None]:
# Load the item table from 'd_items.csv'.
d_items = pd.read_csv(filepath_or_buffer='d_items.csv')

In [None]:
# Show the last 60 items whose unit is mmHg.
d_items.loc[d_items['unitname'].eq('mmHg')].tail(60)