### Script for function to pull vitals data

This function loads chart events (specified by user-entered ID) into a dataframe, filters the events to ventilated patients, determines the last reading before extubation, takes the mean of that value where there are duplicate readings (which happens when multiple sensors are used simultaneously), and returns the data as a column in the master dataframe.

#### import useful libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import psutil
import os

#### print virtual memory available

In [8]:
# Take one snapshot of system memory and report the bytes currently available.
svmem = psutil.virtual_memory()
available_bytes = svmem.available
print(available_bytes)  # in bytes

3545346048


#### print size of database we're pulling from

In [3]:
# On-disk size of the raw chartevents CSV, in bytes (shown as the cell output).
chartevents_path = '../data/raw/chartevents.csv'
os.path.getsize(chartevents_path)

29184776616

#### figure out chunk size for pandas dataframe reading

In [14]:
# Estimate how many rows to read per chunk from chartevents.csv.
# The footprint of a 10-row sample is measured with pandas' default (shallow)
# memory_usage, which does not include the contents of object (string)
# columns, so the result is a rough heuristic rather than an exact bound.
df_sample = pd.read_csv('../data/raw/chartevents.csv', nrows=10)
df_sample_size = df_sample.memory_usage(index=True).sum()  # bytes for the 10 sampled rows
budget_over_sample = 2000000000 / df_sample_size  # 2e9-byte budget relative to the sample
my_chunk = int((budget_over_sample / 10) // 1)  # floor to a whole number of rows
print(my_chunk)

215517


#### create dataframe structure and set chunk size for iterating data into dataframe

In [5]:
# Explicit column types, so type inference does not vary between chunks.
chartevents_dtypes = {
    'subject_id': int,
    'hadm_id': int,
    'stay_id': int,
    'charttime': str,
    'storetime': str,
    'itemid': int,
    'value': str,
    'valuenum': float,
    'valueuom': str,
    'warning': int,
}

# Chunked reader over the raw file: iterating it yields DataFrames of up to
# `my_chunk` rows each instead of loading the whole CSV at once.
iter_csv = pd.read_csv(
    '../data/raw/chartevents.csv',
    dtype=chartevents_dtypes,
    chunksize=my_chunk,
    iterator=True,
)

In [15]:
# Show the reader object; the cell output confirms a TextFileReader was created.
iter_csv

<pandas.io.parsers.TextFileReader at 0x7f26ccea2760>

#### get chart events data

In [6]:
# Stream the file chunk by chunk and keep only rows for the item IDs of
# interest, so just the matching rows are held for the final concatenation.
# NOTE(review): the sample output shows these itemids with unit mmHg —
# presumably blood-pressure measurements; confirm against the item dictionary.
# The reader is consumed by this loop: re-create `iter_csv` before re-running.
wanted_itemids = [220052, 225312, 220181]
filtered_chunks = [chunk[chunk['itemid'].isin(wanted_itemids)] for chunk in iter_csv]
df_result = pd.concat(filtered_chunks)

In [7]:
# Inspect the chart events kept by the itemid filter (pandas truncates the display).
df_result

Unnamed: 0,subject_id,hadm_id,stay_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
64,10003700,28623837,35053963,2165-04-24 05:28:00,2165-04-24 05:37:00,220181,110,110.0,mmHg,0
90,10003700,28623837,35053963,2165-04-24 06:00:00,2165-04-24 06:09:00,220181,88,88.0,mmHg,0
125,10003700,28623837,35053963,2165-04-24 07:00:00,2165-04-24 07:51:00,220181,109,109.0,mmHg,0
175,10003700,28623837,35053963,2165-04-24 08:00:00,2165-04-24 08:19:00,220181,82,82.0,mmHg,0
201,10004235,24181354,30276431,2196-02-24 16:41:00,2196-02-24 17:48:00,220052,82,82.0,mmHg,0
...,...,...,...,...,...,...,...,...,...,...
299921044,19999068,21606769,31096823,2161-08-30 17:00:00,2161-08-30 17:31:00,220181,105,105.0,mmHg,0
299921052,19999068,21606769,31096823,2161-08-30 18:00:00,2161-08-30 18:55:00,220181,89,89.0,mmHg,0
299921057,19999068,21606769,31096823,2161-08-30 19:00:00,2161-08-30 19:41:00,220181,80,80.0,mmHg,0
299921068,19999068,21606769,31096823,2161-08-30 20:00:00,2161-08-30 20:05:00,220181,83,83.0,mmHg,0


In [10]:
# Load the processed ventilation sample and keep only chart events whose
# admission id (hadm_id) appears in it.
svents = pd.read_csv('../data/processed/sample_vents.csv')
ids = svents['hadm_id']
in_sample = df_result['hadm_id'].isin(ids)
df_result = df_result[in_sample]

In [None]:
# to convert Fahrenheit to Celsius: (F - 32) x 5/9
# df_result['valuenum'] = np.where(df_result['itemid']==223761,(df_result['valuenum']-32)*(5/9),df_result['valuenum'])

In [11]:
# Inspect the chart events again after restricting them to hadm_ids present in svents.
df_result

Unnamed: 0,subject_id,hadm_id,stay_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
201,10004235,24181354,30276431,2196-02-24 16:41:00,2196-02-24 17:48:00,220052,82,82.0,mmHg,0
205,10004235,24181354,30276431,2196-02-24 17:00:00,2196-02-24 17:48:00,220052,83,83.0,mmHg,0
247,10004235,24181354,30276431,2196-02-24 17:16:00,2196-02-24 17:17:00,220052,93,93.0,mmHg,0
289,10004235,24181354,30276431,2196-02-24 17:48:00,2196-02-24 17:48:00,220052,88,88.0,mmHg,0
300,10004235,24181354,30276431,2196-02-24 18:00:00,2196-02-24 18:14:00,220052,79,79.0,mmHg,0
...,...,...,...,...,...,...,...,...,...,...
299921044,19999068,21606769,31096823,2161-08-30 17:00:00,2161-08-30 17:31:00,220181,105,105.0,mmHg,0
299921052,19999068,21606769,31096823,2161-08-30 18:00:00,2161-08-30 18:55:00,220181,89,89.0,mmHg,0
299921057,19999068,21606769,31096823,2161-08-30 19:00:00,2161-08-30 19:41:00,220181,80,80.0,mmHg,0
299921068,19999068,21606769,31096823,2161-08-30 20:00:00,2161-08-30 20:05:00,220181,83,83.0,mmHg,0


In [None]:
# Inner-join every chart event with the matching svents row(s) on hadm_id, so
# each reading carries the svents columns alongside it.
merged_inner = pd.merge(
    df_result,
    svents,
    how='inner',
    left_on='hadm_id',
    right_on='hadm_id',
)

In [None]:
# Disabled filter for itemid 224685, which is not one of the itemids selected
# when df_result was built; left commented out.
#merged_inner = merged_inner[merged_inner['itemid']==224685]
# Display the merged frame for inspection.
merged_inner

In [None]:
# Parse both timestamp columns, then store how long before `endtime` each
# reading was charted (a Timedelta; negative when charted after `endtime`).
end_ts = pd.to_datetime(merged_inner['endtime'])
chart_ts = pd.to_datetime(merged_inner['charttime'])
merged_inner['endtime'] = end_ts
merged_inner['charttime'] = chart_ts
merged_inner['time_diff'] = end_ts - chart_ts

In [None]:
# Express the gap between each reading and `endtime` in hours; a positive
# value means the reading was charted before `endtime`.
merged_inner['timediffhour'] = merged_inner['time_diff'] / np.timedelta64(1, 'h') # convert time to hours
# Keep only readings charted before `endtime`. The explicit .copy() makes this
# an independent frame, so adding columns to it later neither triggers pandas'
# SettingWithCopyWarning nor depends on view-vs-copy behaviour.
merged_inner_pre = merged_inner[merged_inner['timediffhour']>0].copy()

In [None]:
# For each admission, find the smallest remaining gap to `endtime` (the time of
# the last reading before extubation) and broadcast it to every row of that
# admission in a new 'min' column.
hours_by_admission = merged_inner_pre.groupby('hadm_id')['timediffhour']
merged_inner_pre['min'] = hours_by_admission.transform('min')

In [None]:
# Spot-check the last 60 rows, including the new 'min' column.
merged_inner_pre.tail(60)

In [None]:
# Keep only the row(s) whose gap equals the admission's minimum ('min'), i.e.
# the last reading(s) before `endtime`; rows tied at that time are all kept.
is_last_reading = merged_inner_pre['timediffhour'] == merged_inner_pre['min']
last_value_pre = merged_inner_pre.loc[is_last_reading]

In [None]:
# Collapse duplicate last readings to a single value per admission by taking
# their mean, then turn the hadm_id index back into a regular column.
last_readings_by_admission = last_value_pre.groupby('hadm_id')['valuenum']
avg_dupValues = last_readings_by_admission.mean()
meanBP = avg_dupValues.reset_index()

In [None]:
# Label the aggregated value column.
# NOTE(review): the column is named 'tidalVol' although the frame is called
# meanBP and the selected itemids report mmHg in the sample output — the label
# looks left over from another vital; confirm before relying on it downstream.
meanBP.columns = ['hadm_id', 'tidalVol']

In [None]:
# Count the missing values in each column of meanBP.
null_flags = meanBP.isnull()
null_flags.sum()

In [None]:
#num_ventilations = last_value_pre['hadm_id'].value_counts()
#svdf = last_value_pre[last_value_pre['hadm_id'].isin(num_ventilations.index[num_ventilations ==2])]
#sns.countplot(svdf['valuenum'])

In [None]:
#new_df

In [None]:
#new_df.to_csv('feature_table.csv',index= False)

### Getting variability over last 6 hours

In [None]:
# Readings charted within the 6 hours before `endtime` (0 < hours < 6).
hours_before_end = merged_inner['timediffhour']
merged_inner_var = merged_inner[(hours_before_end > 0) & (hours_before_end < 6)]

# Per-admission standard deviation of the readings in that window (pandas'
# default std, so an admission with a single reading yields NaN).
# NOTE(review): the output column is named 'stdTidalVol' although the selected
# itemids report mmHg in the sample output — confirm the intended label.
varBP = merged_inner_var.groupby('hadm_id')['valuenum'].std().reset_index()
varBP.columns = ['hadm_id', 'stdTidalVol']

In [None]:
#plt.hist(merged_inner_var['timediffhour'])

# Left-join both per-admission features onto svents so every svents row is
# kept; admissions without a matching feature row get NaN.
new_df = pd.merge(svents, meanBP, how='left', left_on='hadm_id', right_on='hadm_id')
new_df = pd.merge(new_df, varBP, how='left', left_on='hadm_id', right_on='hadm_id')

# Remove the 'temp' and 'stdTemp' columns (raises KeyError if either is absent).
new_df = new_df.drop(['temp', 'stdTemp'], axis=1)

# Write the feature table to the working directory without the index column.
new_df.to_csv('feature_table.csv', index=False)