### Script for function to pull vitals data

This function loads chart events (specified by user-entered ID) into a dataframe, filters events by ventilataed patients, determines the last reading before extubation, takes the mean of this value for any duplicate readings (happens when multiple sensors used simultanrously) and returns the data as a column in the master dataframe

#### impot useful libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import psutil
import os

#### print virtual memory available

In [3]:
svmem = psutil.virtual_memory()
print (svmem.available) #in bytes 

4834975744


#### print size of database we're pulling from

In [4]:
os.path.getsize('./chartevents.csv') 

29184776616

#### figure out chunk size for pandas dataframe reading

In [5]:
df_sample = pd.read_csv('./chartevents.csv', nrows=10)
df_sample_size = df_sample.memory_usage(index=True).sum()
my_chunk = (2000000000 / df_sample_size)/10
my_chunk = int(my_chunk//1) # we get the integer part
print (my_chunk)

215517


In [6]:
iter_csv = pd.read_csv(
    'chartevents.csv',
    iterator=True,
    chunksize=my_chunk,
    dtype={'subject_id': int, 'hadm_id': int, 'stay_id': int, 
           'charttime' : str, 'storetime': str, 'itemid': int,
           'value': str, 'valuenum': float, 'valueuom': str, 'warning': int})

In [7]:
df_result = pd.concat(
    [chunk[(chunk['itemid'] == 224685)|(chunk['itemid'] == 224686)|(chunk['itemid'] == 224421)]
    for chunk in iter_csv])

In [8]:
df_result

Unnamed: 0,subject_id,hadm_id,stay_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
103,10003700,28623837,35053963,2165-04-24 06:00:00,2165-04-24 05:41:00,224685,426,426.0,mL,0
142,10003700,28623837,35053963,2165-04-24 07:00:00,2165-04-24 07:37:00,224685,451,451.0,mL,0
217,10004235,24181354,30276431,2196-02-24 17:00:00,2196-02-24 17:07:00,224685,605,605.0,mL,0
362,10004235,24181354,30276431,2196-02-24 20:00:00,2196-02-24 19:57:00,224685,596,596.0,mL,0
515,10004235,24181354,30276431,2196-02-25 01:00:00,2196-02-25 00:56:00,224685,881,881.0,mL,0
...,...,...,...,...,...,...,...,...,...,...
299920394,19999068,21606769,31096823,2161-08-28 05:00:00,2161-08-28 05:23:00,224421,579,579.0,mL,0
299920420,19999068,21606769,31096823,2161-08-28 07:00:00,2161-08-28 07:48:00,224685,140,140.0,mL,0
299920421,19999068,21606769,31096823,2161-08-28 07:00:00,2161-08-28 07:48:00,224686,465,465.0,mL,0
299920503,19999068,21606769,31096823,2161-08-28 11:00:00,2161-08-28 11:46:00,224685,151,151.0,mL,0


In [9]:
#df_result =  pd.read_csv('blood_pressure.csv')

In [10]:
svents = pd.read_csv('feature_table.csv')
ids = svents['hadm_id']
df_result = df_result[df_result['hadm_id'].isin(ids)]

In [11]:
# to convert fahrenheit to celcius: (F-32) x 5/9
# df_result['valuenum'] = np.where(df_result['itemid']==223761,(df_result['valuenum']-32)*(5/9),df_result['valuenum'])

In [12]:
df_result

Unnamed: 0,subject_id,hadm_id,stay_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
103,10003700,28623837,35053963,2165-04-24 06:00:00,2165-04-24 05:41:00,224685,426,426.0,mL,0
142,10003700,28623837,35053963,2165-04-24 07:00:00,2165-04-24 07:37:00,224685,451,451.0,mL,0
217,10004235,24181354,30276431,2196-02-24 17:00:00,2196-02-24 17:07:00,224685,605,605.0,mL,0
362,10004235,24181354,30276431,2196-02-24 20:00:00,2196-02-24 19:57:00,224685,596,596.0,mL,0
515,10004235,24181354,30276431,2196-02-25 01:00:00,2196-02-25 00:56:00,224685,881,881.0,mL,0
...,...,...,...,...,...,...,...,...,...,...
299920394,19999068,21606769,31096823,2161-08-28 05:00:00,2161-08-28 05:23:00,224421,579,579.0,mL,0
299920420,19999068,21606769,31096823,2161-08-28 07:00:00,2161-08-28 07:48:00,224685,140,140.0,mL,0
299920421,19999068,21606769,31096823,2161-08-28 07:00:00,2161-08-28 07:48:00,224686,465,465.0,mL,0
299920503,19999068,21606769,31096823,2161-08-28 11:00:00,2161-08-28 11:46:00,224685,151,151.0,mL,0


In [13]:
merged_inner = pd.merge(left=df_result, right=svents, left_on='hadm_id', right_on='hadm_id')

In [14]:
merged_inner

Unnamed: 0,subject_id,hadm_id,stay_id,charttime,storetime,itemid,value_x,valuenum,valueuom_x,warning,...,spontRR,stdABP,meanABP,stdSpontRR,pulseox,stdPulseox,temp,stdTemp,heartRate,stdHeartRate
0,10003700,28623837,35053963,2165-04-24 06:00:00,2165-04-24 05:41:00,224685,426,426.0,mL,0,...,14.0,14.361407,82.0,5.715476,100.0,0.000000,36.555556,0.078567,56.0,4.086563
1,10003700,28623837,35053963,2165-04-24 07:00:00,2165-04-24 07:37:00,224685,451,451.0,mL,0,...,14.0,14.361407,82.0,5.715476,100.0,0.000000,36.555556,0.078567,56.0,4.086563
2,10004235,24181354,30276431,2196-02-24 17:00:00,2196-02-24 17:07:00,224685,605,605.0,mL,0,...,14.0,8.648699,88.0,1.195229,100.0,0.816497,36.833333,,112.0,4.215052
3,10004235,24181354,30276431,2196-02-24 20:00:00,2196-02-24 19:57:00,224685,596,596.0,mL,0,...,14.0,8.648699,88.0,1.195229,100.0,0.816497,36.833333,,112.0,4.215052
4,10004235,24181354,30276431,2196-02-25 01:00:00,2196-02-25 00:56:00,224685,881,881.0,mL,0,...,14.0,8.648699,88.0,1.195229,100.0,0.816497,36.833333,,112.0,4.215052
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1339723,19999068,21606769,31096823,2161-08-28 05:00:00,2161-08-28 05:23:00,224421,579,579.0,mL,0,...,12.0,10.094553,94.0,4.733996,100.0,0.786796,37.166667,0.196419,81.0,10.526610
1339724,19999068,21606769,31096823,2161-08-28 07:00:00,2161-08-28 07:48:00,224685,140,140.0,mL,0,...,12.0,10.094553,94.0,4.733996,100.0,0.786796,37.166667,0.196419,81.0,10.526610
1339725,19999068,21606769,31096823,2161-08-28 07:00:00,2161-08-28 07:48:00,224686,465,465.0,mL,0,...,12.0,10.094553,94.0,4.733996,100.0,0.786796,37.166667,0.196419,81.0,10.526610
1339726,19999068,21606769,31096823,2161-08-28 11:00:00,2161-08-28 11:46:00,224685,151,151.0,mL,0,...,12.0,10.094553,94.0,4.733996,100.0,0.786796,37.166667,0.196419,81.0,10.526610


In [15]:
merged_inner['endtime'] = pd.to_datetime(merged_inner['endtime'])
merged_inner['charttime'] = pd.to_datetime(merged_inner['charttime'])
merged_inner['time_diff']= merged_inner['endtime']-merged_inner['charttime']

In [16]:
merged_inner['timediffhour'] = merged_inner['time_diff'] / np.timedelta64(1, 'h') # convert time to hours
merged_inner_pre = merged_inner[merged_inner['timediffhour']>0]

In [17]:
merged_inner_pre['min']=merged_inner_pre.groupby('hadm_id')['timediffhour'].transform('min') # find the last reading 
            # time before extubation occured and place in a new channel ('min')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_inner_pre['min']=merged_inner_pre.groupby('hadm_id')['timediffhour'].transform('min') # find the last reading


In [18]:
merged_inner_pre.tail(60)

Unnamed: 0,subject_id,hadm_id,stay_id,charttime,storetime,itemid,value_x,valuenum,valueuom_x,warning,...,stdSpontRR,pulseox,stdPulseox,temp,stdTemp,heartRate,stdHeartRate,time_diff,timediffhour,min
1339666,19970491,29133530,38263194,2132-04-26 01:00:00,2132-04-26 01:16:00,224686,615,615.0,mL,0,...,3.962323,98.0,0.957427,36.222222,,58.0,12.62141,2 days 11:19:00,59.316667,32.316667
1339667,19970491,29133530,38263194,2132-04-26 04:00:00,2132-04-26 04:45:00,224685,502,502.0,mL,0,...,3.962323,98.0,0.957427,36.222222,,58.0,12.62141,2 days 08:19:00,56.316667,32.316667
1339668,19970491,29133530,38263194,2132-04-26 04:00:00,2132-04-26 04:45:00,224686,544,544.0,mL,0,...,3.962323,98.0,0.957427,36.222222,,58.0,12.62141,2 days 08:19:00,56.316667,32.316667
1339669,19970491,29133530,38263194,2132-04-26 07:30:00,2132-04-26 07:35:00,224685,524,524.0,mL,0,...,3.962323,98.0,0.957427,36.222222,,58.0,12.62141,2 days 04:49:00,52.816667,32.316667
1339670,19970491,29133530,38263194,2132-04-26 12:00:00,2132-04-26 12:16:00,224685,494,494.0,mL,0,...,3.962323,98.0,0.957427,36.222222,,58.0,12.62141,2 days 00:19:00,48.316667,32.316667
1339671,19970491,29133530,38263194,2132-04-26 16:00:00,2132-04-26 15:54:00,224685,405,405.0,mL,0,...,3.962323,98.0,0.957427,36.222222,,58.0,12.62141,1 days 20:19:00,44.316667,32.316667
1339672,19970491,29133530,38263194,2132-04-26 16:00:00,2132-04-26 15:54:00,224686,405,405.0,mL,0,...,3.962323,98.0,0.957427,36.222222,,58.0,12.62141,1 days 20:19:00,44.316667,32.316667
1339673,19970491,29133530,38263194,2132-04-26 16:30:00,2132-04-26 16:27:00,224685,469,469.0,mL,0,...,3.962323,98.0,0.957427,36.222222,,58.0,12.62141,1 days 19:49:00,43.816667,32.316667
1339674,19970491,29133530,38263194,2132-04-26 19:00:00,2132-04-26 19:28:00,224685,487,487.0,mL,0,...,3.962323,98.0,0.957427,36.222222,,58.0,12.62141,1 days 17:19:00,41.316667,32.316667
1339675,19970491,29133530,38263194,2132-04-26 19:00:00,2132-04-26 19:28:00,224686,489,489.0,mL,0,...,3.962323,98.0,0.957427,36.222222,,58.0,12.62141,1 days 17:19:00,41.316667,32.316667


In [19]:
last_value_pre = merged_inner_pre[merged_inner_pre['timediffhour'] == merged_inner_pre['min']] 
            # select only rows where 'timediffhour' is the same as the minimum time ('min')

In [20]:
avg_dupValues = last_value_pre.groupby('hadm_id')['valuenum'].mean() # where duplicate values exist,take the mean
meanBP = avg_dupValues.reset_index()

In [21]:
meanBP.columns = (['hadm_id','tidalVol'])

In [22]:
meanBP

Unnamed: 0,hadm_id,tidalVol
0,20000147,413.0
1,20001305,323.0
2,20001361,871.0
3,20001687,388.0
4,20002267,424.0
...,...,...
24565,29998113,402.0
24566,29998115,390.5
24567,29998399,365.0
24568,29999098,495.0


In [23]:
#num_ventilations = last_value_pre['hadm_id'].value_counts()
#svdf = last_value_pre[last_value_pre['hadm_id'].isin(num_ventilations.index[num_ventilations ==2])]
#sns.countplot(svdf['valuenum'])

In [24]:
#new_df

In [25]:
#new_df.to_csv('feature_table.csv',index= False)

### Getting variability over last 6 hours

In [26]:
merged_inner_var = merged_inner[(merged_inner['timediffhour']>0)&(merged_inner['timediffhour']<6)]

varBP = merged_inner_var.groupby('hadm_id')['valuenum'].std()
varBP = varBP.reset_index()
varBP.columns = (['hadm_id','stdTidalVol'])

new_df = pd.merge(left = svents, right=meanBP, how = 'left', left_on='hadm_id', right_on='hadm_id')
new_df

new_df = pd.merge(left = new_df, right=varBP, how = 'left', left_on='hadm_id', right_on='hadm_id')
new_df

In [27]:
#new_df.drop(['temp','stdTemp'],axis=1,inplace=True)

new_df.to_csv('feature_table.csv',index= False)