### Script for function to pull vitals data

This function loads chart events (specified by user-entered ID) into a dataframe, filters events by ventilataed patients, determines the last reading before extubation, takes the mean of this value for any duplicate readings (happens when multiple sensors used simultanrously) and returns the data as a column in the master dataframe

#### impot useful libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# to convert fahrenheit to celcius: (F-32) x 5/9
# df_result['valuenum'] = np.where(df_result['itemid']==223761,(df_result['valuenum']-32)*(5/9),df_result['valuenum'])

#### user-defined variables that change every time

In [3]:
event = "tidalvolume"
data_dir = "../data/feathered/"
export_dir = "../data/processed/"

datafile = data_dir+event
mean_export_file = export_dir+event
std_export_file = export_dir+"std_"+event
meanColumns = ['hadm_id',event]
stdColumns = ['hadm_id','std_'+event]

In [4]:
df = pd.read_feather(datafile)
df['valuenum']= pd.to_numeric(df['valuenum'])

In [5]:
df

Unnamed: 0,subject_id,hadm_id,stay_id,charttime,storetime,itemid,value,valuenum,valueuom,warning,endtime,re_intub_class,time_on_vent
0,10004235,24181354,30276431,2196-02-26 10:00:00,2196-02-26 10:05:00,223761,98.5,98.5,°F,0,2196-02-27 16:28:00,0,71.600000
1,10004235,24181354,30276431,2196-02-26 16:00:00,2196-02-26 16:06:00,223761,98.2,98.2,°F,0,2196-02-27 16:28:00,0,71.600000
2,10004235,24181354,30276431,2196-02-27 08:00:00,2196-02-27 08:04:00,223761,98.3,98.3,°F,0,2196-02-27 16:28:00,0,71.600000
3,10004235,24181354,30276431,2196-02-28 07:00:00,2196-02-28 07:29:00,223761,98.3,98.3,°F,0,2196-02-27 16:28:00,0,71.600000
4,10004235,24181354,30276431,2196-02-28 11:00:00,2196-02-28 11:20:00,223761,98.3,98.3,°F,0,2196-02-27 16:28:00,0,71.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
604194,19999068,21606769,31096823,2161-08-29 13:00:00,2161-08-29 13:05:00,223761,97.8,97.8,°F,0,2161-08-28 13:36:00,0,70.016667
604195,19999068,21606769,31096823,2161-08-29 17:00:00,2161-08-29 17:23:00,223761,98.3,98.3,°F,0,2161-08-28 13:36:00,0,70.016667
604196,19999068,21606769,31096823,2161-08-29 19:00:00,2161-08-29 19:44:00,223761,98.5,98.5,°F,0,2161-08-28 13:36:00,0,70.016667
604197,19999068,21606769,31096823,2161-08-30 08:00:00,2161-08-30 08:03:00,223761,99.3,99.3,°F,0,2161-08-28 13:36:00,0,70.016667


#### Convert times to pandas datetime object, find the time difference between each event and the time of extubation ('endtime') and find the last time an event occurred before extubation

In [6]:
df['endtime'] = pd.to_datetime(df['endtime'])
df['charttime'] = pd.to_datetime(df['charttime'])
df['time_diff']= df['endtime']-df['charttime']

In [7]:
df['timediffhour'] = df['time_diff'] / np.timedelta64(1, 'h') # convert time to hours
df_pre = df.copy()
df_pre = df[df['timediffhour']>0 & (df['timediffhour']<2)]

In [8]:
df_pre['min']=df_pre.groupby('hadm_id')['timediffhour'].transform('min') # find the last reading 
            # time before extubation occured and place in a new channel ('min')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pre['min']=df_pre.groupby('hadm_id')['timediffhour'].transform('min') # find the last reading


In [9]:
last_value_pre = df_pre[df_pre['timediffhour'] == df_pre['min']] 
            # select only rows where 'timediffhour' is the same as the minimum time ('min')

In [10]:
last_value_pre

Unnamed: 0,subject_id,hadm_id,stay_id,charttime,storetime,itemid,value,valuenum,valueuom,warning,endtime,re_intub_class,time_on_vent,time_diff,timediffhour,min
2,10004235,24181354,30276431,2196-02-27 08:00:00,2196-02-27 08:04:00,223761,98.3,98.3,°F,0,2196-02-27 16:28:00,0,71.600000,0 days 08:28:00,8.466667,8.466667
30,10019003,27525946,30460871,2153-04-14 17:00:00,2153-04-14 17:28:00,223761,98.9,98.9,°F,0,2153-04-14 17:50:00,0,22.066667,0 days 00:50:00,0.833333,0.833333
159,10035631,29276678,35275147,2116-03-09 16:00:00,2116-03-09 17:13:00,223761,98.5,98.5,°F,0,2116-03-09 18:11:00,0,217.983333,0 days 02:11:00,2.183333,2.183333
202,10035747,27083519,39236053,2126-05-15 12:14:00,2126-05-15 12:14:00,223761,99.7,99.7,°F,0,2126-05-15 13:00:00,0,143.000000,0 days 00:46:00,0.766667,0.766667
218,10038933,25129047,37828970,2148-09-11 12:00:00,2148-09-11 12:38:00,223761,99,99.0,°F,0,2148-09-11 14:00:00,0,24.216667,0 days 02:00:00,2.000000,2.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
604071,19965610,28545396,30691619,2125-09-28 12:00:00,2125-09-28 14:42:00,223761,98.9,98.9,°F,0,2125-09-28 12:51:00,0,52.733333,0 days 00:51:00,0.850000,0.850000
604093,19970491,22119205,30220330,2131-02-13 07:00:00,2131-02-13 07:34:00,223761,98.1,98.1,°F,0,2131-02-13 09:17:00,0,48.733333,0 days 02:17:00,2.283333,2.283333
604110,19970491,20897702,32207420,2131-05-11 20:00:00,2131-05-11 20:06:00,223761,99.4,99.4,°F,0,2131-05-11 21:22:00,0,25.366667,0 days 01:22:00,1.366667,1.366667
604152,19970491,29133530,38263194,2132-04-28 08:00:00,2132-04-28 08:15:00,223761,97.2,97.2,°F,0,2132-04-28 12:19:00,0,90.316667,0 days 04:19:00,4.316667,4.316667


In [11]:
avg_dupValues = last_value_pre.groupby('hadm_id')['valuenum'].mean() # where duplicate values exist,take the mean
meanBP = avg_dupValues.reset_index()
meanBP.columns = (meanColumns)

In [12]:
svents = pd.read_csv(export_dir +'sample_vents.csv')
mean_df = pd.merge(left = svents, right=meanBP, how = 'left', left_on='hadm_id', right_on='hadm_id')   

In [13]:
mean_df.to_feather(mean_export_file)

In [14]:
#num_ventilations = last_value_pre['hadm_id'].value_counts()
#svdf = last_value_pre[last_value_pre['hadm_id'].isin(num_ventilations.index[num_ventilations ==2])]
#sns.countplot(svdf['valuenum'])

### Getting variability over last 6 hours

In [15]:
df_var = df.copy()
df_var = df[(df['timediffhour']>0)&(df['timediffhour']<6)]

varBP = df_var.groupby('hadm_id')['valuenum'].std()
varBP = varBP.reset_index()
varBP.columns = (stdColumns)

In [16]:
#plt.hist(merged_inner_var['timediffhour'])

In [17]:
std_df = pd.merge(left = svents, right=varBP, how = 'left', left_on='hadm_id', right_on='hadm_id')

In [18]:
std_df.to_feather(std_export_file)

new_df = pd.merge(left = svents, right=meanBP, how = 'left', left_on='hadm_id', right_on='hadm_id')
new_df

new_df = pd.merge(left = new_df, right=varBP, how = 'left', left_on='hadm_id', right_on='hadm_id')
new_df

new_df.drop(['temp','stdTemp'],axis=1,inplace=True)

new_df.to_csv('feature_table.csv',index= False)