## Script to get clean feature table vefore passing to model

Cleaned feature table saved as 'clean_ft_table.csv'

#### import libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

#### get feature library

In [2]:
df = pd.read_feather('../data/processed/gender')

In [3]:
df_master = df.copy()

In [4]:
# select only useful columns for master dataframe
df_master=df_master[['hadm_id', 'endtime', 'time_on_vent', 're_intub_class',
                     'gender','admission_type','anchor_age']]

#### load each feature and merge with df_master on hadm_id

In [5]:
# all feature names are the same as the feature column so we can find the name of all features in
# the 'selected_features' directory, load each feature in turn, select only the relevant columns and merge these 
# with df_master on 'hadm_id'
dirList= os.listdir('../data/processed/selected_features')
filename = []

for fname in dirList:
    feature = np.append(filename, fname)
    feature_table = pd.read_feather('../data/processed/selected_features/'+fname)
    feature_columns = feature_table[['hadm_id',fname]]
    df_master = pd.merge(left = df_master, right = feature_columns, 
                     how='left', left_on = 'hadm_id',right_on = 'hadm_id')

In [6]:
df_master

Unnamed: 0,hadm_id,endtime,time_on_vent,re_intub_class,gender,admission_type,anchor_age,spontrr,heartrate,std_spontrr,...,height,lactic_acid,bnp,hemoglobin,wbg,tidalvolume,temp,std_bloodpressure,pulseox,std_temp
0,28038802,2185-12-20 09:59:00,108.800000,0,M,EW EMER.,71,17.0,87.0,5.560918,...,,0.7,,12.6,6.6,568.0,36.555556,10.830303,88.0,0.274986
1,21790335,2140-03-11 13:44:00,33.000000,0,F,URGENT,34,13.0,78.0,0.447214,...,,2.9,,9.1,10.5,578.0,36.888889,1.923538,92.0,
2,24357541,2177-02-10 14:00:00,659.416667,0,F,URGENT,70,0.0,127.0,11.631239,...,59.027559,0.8,48581.0,8.5,8.4,0.0,37.500000,7.120393,94.0,
3,22188993,2145-11-04 18:40:00,143.666667,0,M,EW EMER.,68,27.0,95.0,0.726483,...,74.007874,1.8,,11.0,5.4,594.0,37.388889,3.669696,98.0,
4,21880799,2134-05-22 17:58:00,54.966667,0,M,EW EMER.,52,24.0,100.0,9.227289,...,66.070866,1.0,,7.8,8.4,500.0,36.555556,,100.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16399,29960248,2146-12-09 14:23:00,73.466667,1,F,DIRECT EMER.,65,13.0,94.0,1.863525,...,61.011811,1.0,,10.3,18.6,519.0,37.055556,3.761299,96.0,
16400,29962016,2135-10-30 11:45:00,218.083333,1,F,EW EMER.,53,17.0,80.0,1.772811,...,64.086614,1.4,,9.0,10.7,308.0,36.888889,12.378938,100.0,
16401,29974575,2131-03-03 19:54:00,98.900000,1,M,EW EMER.,72,21.0,71.0,8.952254,...,70.039370,0.8,,10.0,6.0,447.0,36.888889,296.433466,100.0,
16402,29987115,2148-02-19 10:00:00,44.000000,1,F,URGENT,43,12.0,88.0,4.527693,...,59.921260,1.1,,9.0,6.5,594.0,36.722222,7.949843,99.0,


#### select only patients that were intubated for longer than 6 hours 

To get rid of all the following warnings, sHould change this to the df.where df.copy together? https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#the-where-method-and-masking

In [7]:
df6 = df_master.copy()
df6 = df_master[(df_master['time_on_vent']>6)]

In [22]:
df6

Unnamed: 0,hadm_id,endtime,time_on_vent,re_intub_class,gender,admission_type,anchor_age,spontrr,heartrate,std_spontrr,...,creatinine,bun,height,lactic_acid,hemoglobin,wbg,tidalvolume,temp,std_bloodpressure,pulseox
6,28691076,2149-08-01 15:31:00,25.983333,0,F,EW EMER.,72,19.0,79.0,3.718759,...,2.2,14.0,59.921260,8.1,9.2,12.2,387.0,36.444444,20.347548,100.0
7,23452785,2163-04-17 05:00:00,17.000000,0,M,EW EMER.,23,33.0,122.0,3.577709,...,0.7,23.0,70.933071,0.9,12.2,8.3,527.0,39.166667,6.350853,96.0
8,27434217,2130-05-20 16:14:00,580.233333,0,F,EW EMER.,81,24.0,63.0,1.788854,...,0.8,29.0,61.905512,1.0,9.4,16.0,437.0,36.833333,6.418723,96.0
12,23481036,2112-05-25 09:52:00,83.866667,0,F,EW EMER.,61,18.0,73.0,8.455767,...,0.9,23.0,66.070866,1.2,8.4,24.3,556.0,36.444444,7.366591,100.0
13,27720138,2122-04-07 11:21:00,160.350000,0,F,EW EMER.,62,33.0,96.0,5.962682,...,0.8,12.0,57.043307,1.7,9.5,11.8,417.0,36.944444,2.190890,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16395,29925308,2128-07-30 06:02:00,10.416667,1,M,EW EMER.,85,15.0,71.0,1.870829,...,1.4,18.0,66.964567,1.0,8.1,10.0,498.0,36.888889,3.817254,100.0
16397,29947543,2144-04-02 15:17:00,134.333333,1,F,EW EMER.,68,15.0,108.0,10.264556,...,0.5,13.0,59.921260,1.0,9.8,20.4,320.0,37.611111,13.441230,96.0
16399,29960248,2146-12-09 14:23:00,73.466667,1,F,DIRECT EMER.,65,13.0,94.0,1.863525,...,2.9,62.0,61.011811,1.0,10.3,18.6,519.0,37.055556,3.761299,96.0
16400,29962016,2135-10-30 11:45:00,218.083333,1,F,EW EMER.,53,17.0,80.0,1.772811,...,0.5,13.0,64.086614,1.4,9.0,10.7,308.0,36.888889,12.378938,100.0


In [28]:
df6.isnull().sum()

hadm_id              0
endtime              0
time_on_vent         0
re_intub_class       0
gender               0
admission_type       0
anchor_age           0
spontrr              0
heartrate            0
std_spontrr          0
weight               0
sodium               0
abg_po2              0
abg_ph               0
hco3                 0
abg_pco2             0
bloodpressure        0
std_pulseox          0
std_heartrate        0
creatinine           0
bun                  0
height               0
lactic_acid          0
hemoglobin           0
wbg                  0
tidalvolume          0
temp                 0
std_bloodpressure    0
pulseox              0
dtype: int64

In [9]:
df6.dropna(subset = ['std_bloodpressure'],inplace=True)
df6.dropna(subset = ['std_spontrr'],inplace=True)
df6.dropna(subset = ['temp'],inplace=True)
df6.dropna(subset = ['std_pulseox'],inplace=True)
df6.dropna(subset = ['std_heartrate'],inplace=True)
df6.dropna(subset = ['weight'],inplace=True)
df6.dropna(subset = ['height'],inplace=True)
df6.dropna(subset = ['tidalvolume'],inplace=True)

In [27]:
df6.dropna(subset = ['abg_po2'],inplace=True)
df6.dropna(subset = ['lactic_acid'],inplace=True)
df6.dropna(subset = ['wbg'],inplace=True)
df6.dropna(subset = ['hco3'],inplace=True)
df6.dropna(subset = ['bun'],inplace=True)

In [None]:
df6.isnull().sum()

In [11]:
df6.drop(['std_temp','std_tidalvolume','bnp','vbg_ph','vbg_pco2'],axis=1,inplace=True)

In [None]:
df6[df6['re_intub_class']==1]

In [29]:
df6.describe()

Unnamed: 0,hadm_id,time_on_vent,re_intub_class,anchor_age,spontrr,heartrate,std_spontrr,weight,sodium,abg_po2,...,creatinine,bun,height,lactic_acid,hemoglobin,wbg,tidalvolume,temp,std_bloodpressure,pulseox
count,6562.0,6562.0,6562.0,6562.0,6562.0,6562.0,6562.0,6562.0,6562.0,6562.0,...,6562.0,6562.0,6562.0,6562.0,6562.0,6562.0,6562.0,6562.0,6562.0,6562.0
mean,24969180.0,108.040221,0.103017,62.015696,19.447831,86.412831,4.444946,186.123551,139.482094,427.499086,...,1.372943,28.216855,66.567084,306.226504,9.762024,12.256099,479.347136,36.983793,8.906753,97.598293
std,2876341.0,143.096898,0.304005,16.300474,5.854213,17.261708,3.829302,55.646766,4.789115,17454.687614,...,1.314319,22.229054,5.432982,17456.723191,1.716487,7.233527,148.166916,1.721733,8.578855,2.955987
min,20001360.0,6.016667,0.0,18.0,0.0,0.0,0.0,56.0,115.0,25.0,...,0.0,1.0,0.0,0.3,0.0,0.1,101.0,-17.777778,0.0,0.0
25%,22488250.0,21.483333,0.0,52.0,15.5,74.0,2.439913,149.6,136.0,91.0,...,0.7,14.0,64.086614,1.0,8.5,8.2,380.0,36.722222,5.010614,96.0
50%,24961880.0,60.141667,0.0,63.0,19.0,85.0,3.777124,176.4,139.0,112.0,...,0.9,21.0,66.964567,1.3,9.5,11.1,457.0,37.0,7.27782,98.0
75%,27465200.0,140.516667,0.0,74.0,23.0,97.0,5.951815,213.4,142.0,142.0,...,1.5,35.0,70.03937,1.7,10.8,14.7,553.0,37.388889,10.490075,100.0
max,29999620.0,2389.733333,1.0,91.0,66.0,173.0,160.603722,605.0,166.0,999999.0,...,13.6,276.0,175.098425,999999.0,17.7,250.2,1602.0,40.0,296.433466,100.0


#### tidal volume should be below 2000, weight shoule be above 80

In [14]:
df6.drop(df6[(df6['tidalvolume'] > 2000)|(df6['tidalvolume'] < 100)].index, inplace = True) 

In [15]:
df6.drop(df6[df6['weight'] < 50].index, inplace = True)

In [None]:
df6['admission_type'].value_counts()

In [16]:
df6.drop(df6.loc[df6['admission_type']=='DIRECT OBSERVATION'].index, inplace=True)
df6.drop(df6.loc[df6['admission_type']=='EU OBSERVATION'].index, inplace=True)
df6.drop(df6.loc[df6['admission_type']=='SURGICAL SAME DAY ADMISSION'].index, inplace=True)

In [32]:
df6.reset_index(inplace=True)

In [33]:
df6.drop('index',axis=1,inplace=True)

In [34]:
df6

Unnamed: 0,hadm_id,endtime,time_on_vent,re_intub_class,gender,admission_type,anchor_age,spontrr,heartrate,std_spontrr,...,creatinine,bun,height,lactic_acid,hemoglobin,wbg,tidalvolume,temp,std_bloodpressure,pulseox
0,28691076,2149-08-01 15:31:00,25.983333,0,F,EW EMER.,72,19.0,79.0,3.718759,...,2.2,14.0,59.921260,8.1,9.2,12.2,387.0,36.444444,20.347548,100.0
1,23452785,2163-04-17 05:00:00,17.000000,0,M,EW EMER.,23,33.0,122.0,3.577709,...,0.7,23.0,70.933071,0.9,12.2,8.3,527.0,39.166667,6.350853,96.0
2,27434217,2130-05-20 16:14:00,580.233333,0,F,EW EMER.,81,24.0,63.0,1.788854,...,0.8,29.0,61.905512,1.0,9.4,16.0,437.0,36.833333,6.418723,96.0
3,23481036,2112-05-25 09:52:00,83.866667,0,F,EW EMER.,61,18.0,73.0,8.455767,...,0.9,23.0,66.070866,1.2,8.4,24.3,556.0,36.444444,7.366591,100.0
4,27720138,2122-04-07 11:21:00,160.350000,0,F,EW EMER.,62,33.0,96.0,5.962682,...,0.8,12.0,57.043307,1.7,9.5,11.8,417.0,36.944444,2.190890,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6557,29925308,2128-07-30 06:02:00,10.416667,1,M,EW EMER.,85,15.0,71.0,1.870829,...,1.4,18.0,66.964567,1.0,8.1,10.0,498.0,36.888889,3.817254,100.0
6558,29947543,2144-04-02 15:17:00,134.333333,1,F,EW EMER.,68,15.0,108.0,10.264556,...,0.5,13.0,59.921260,1.0,9.8,20.4,320.0,37.611111,13.441230,96.0
6559,29960248,2146-12-09 14:23:00,73.466667,1,F,DIRECT EMER.,65,13.0,94.0,1.863525,...,2.9,62.0,61.011811,1.0,10.3,18.6,519.0,37.055556,3.761299,96.0
6560,29962016,2135-10-30 11:45:00,218.083333,1,F,EW EMER.,53,17.0,80.0,1.772811,...,0.5,13.0,64.086614,1.4,9.0,10.7,308.0,36.888889,12.378938,100.0


In [35]:
df6.to_feather('../data/processed/clean_ft_table_labs')

In [None]:
#adm_type=pd.get_dummies(df6['admission_type'],drop_first=True,prefix='adm_type')

don't include ethnicity in model (yet), also don't include admission location in model

In [None]:
#gender =pd.get_dummies(df6['gender'],drop_first=True,prefix = 'gender')

In [None]:
#df_for_model = pd.concat([df6,adm_type,gender],axis=1)

In [None]:
#df_for_model.drop(['hadm_id','admission_type','admission_location','ethnicity','gender'],axis=1,inplace=True)

In [None]:
#df_for_model

In [None]:
#df_for_model.to_csv('clean_ft_table.csv',index=False)