## Script to get clean feature table before passing to model

The cleaned feature table is saved in Feather format to '../data/processed/clean_ft_table'.

#### import libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

#### get feature library

In [2]:
# Load the base table (Feather file); it supplies the identifier, outcome and
# demographic columns that are selected for df_master below.
df = pd.read_feather('../data/processed/gender')

In [3]:
# Work on a copy so the frame as loaded (`df`) is left untouched.
df_master = df.copy()

In [4]:
# Restrict the master dataframe to the admission id, the extubation/outcome
# fields and the demographic columns; every other column is discarded.
master_columns = [
    'hadm_id', 'endtime', 'time_on_vent', 're_intub_class',
    'gender', 'admission_type', 'anchor_age',
]
df_master = df_master[master_columns]

#### load each feature and merge with df_master on hadm_id

In [5]:
# Each file in the 'selected_features' directory is named after the feature
# column it contains, so the directory listing doubles as the list of feature
# names. For every file: load it, keep only 'hadm_id' plus the feature column,
# and left-merge it onto df_master so that all rows of df_master are retained.
# NOTE(review): assumes each feature file has at most one row per hadm_id;
# otherwise the left merge would duplicate rows — TODO confirm.
feature_dir = '../data/processed/selected_features'

for fname in os.listdir(feature_dir):
    feature_table = pd.read_feather(os.path.join(feature_dir, fname))
    feature_columns = feature_table[['hadm_id', fname]]
    df_master = pd.merge(left=df_master, right=feature_columns,
                         how='left', on='hadm_id')

In [6]:
# Inspect the merged table: the base columns plus one column per loaded feature.
df_master

Unnamed: 0,hadm_id,endtime,time_on_vent,re_intub_class,gender,admission_type,anchor_age,spontrr,heartrate,std_spontrr,...,bloodpressure,std_pulseox,std_heartrate,std_tidalvolume,height,tidalvolume,temp,std_bloodpressure,pulseox,std_temp
0,28038802,2185-12-20 09:59:00,108.800000,0,M,EW EMER.,71,17.0,87.0,5.560918,...,107.0,2.906367,5.119038,38.183766,,568.0,36.555556,10.830303,88.0,0.274986
1,21790335,2140-03-11 13:44:00,33.000000,0,F,URGENT,34,13.0,78.0,0.447214,...,82.0,2.167948,2.489980,,,578.0,36.888889,1.923538,92.0,
2,24357541,2177-02-10 14:00:00,659.416667,0,F,URGENT,70,0.0,127.0,11.631239,...,102.0,2.000000,7.314369,272.236111,59.027559,0.0,37.500000,7.120393,94.0,
3,22188993,2145-11-04 18:40:00,143.666667,0,M,EW EMER.,68,27.0,95.0,0.726483,...,72.0,0.752773,4.230839,38.370996,74.007874,594.0,37.388889,3.669696,98.0,
4,21880799,2134-05-22 17:58:00,54.966667,0,M,EW EMER.,52,24.0,100.0,9.227289,...,65.0,0.000000,5.431390,9.899495,66.070866,500.0,36.555556,,100.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16399,29960248,2146-12-09 14:23:00,73.466667,1,F,DIRECT EMER.,65,13.0,94.0,1.863525,...,64.5,1.354006,2.321398,,61.011811,519.0,37.055556,3.761299,96.0,
16400,29962016,2135-10-30 11:45:00,218.083333,1,F,EW EMER.,53,17.0,80.0,1.772811,...,71.0,0.000000,2.483277,,64.086614,308.0,36.888889,12.378938,100.0,
16401,29974575,2131-03-03 19:54:00,98.900000,1,M,EW EMER.,72,21.0,71.0,8.952254,...,73.0,1.549193,14.052283,,70.039370,447.0,36.888889,296.433466,100.0,
16402,29987115,2148-02-19 10:00:00,44.000000,1,F,URGENT,43,12.0,88.0,4.527693,...,122.0,0.547723,4.472136,94.752309,59.921260,594.0,36.722222,7.949843,99.0,


#### select only patients that were intubated for longer than 6 hours 

To get rid of the warnings raised by the following cells, should this be changed to use `df.where` together with `df.copy()`? See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#the-where-method-and-masking

In [7]:
# Keep only admissions with time_on_vent greater than 6 (hours, per the section
# heading). The copy is taken AFTER filtering so that df6 is an independent
# frame rather than a slice of df_master; copying first and then re-assigning
# the slice (as before) discarded the copy and left df6 flagged as a slice,
# which is what makes later in-place edits raise SettingWithCopyWarning.
df6 = df_master[df_master['time_on_vent'] > 6].copy()

In [8]:
# Inspect the table after the ventilation-time filter.
df6

Unnamed: 0,hadm_id,endtime,time_on_vent,re_intub_class,gender,admission_type,anchor_age,spontrr,heartrate,std_spontrr,...,bloodpressure,std_pulseox,std_heartrate,std_tidalvolume,height,tidalvolume,temp,std_bloodpressure,pulseox,std_temp
0,28038802,2185-12-20 09:59:00,108.800000,0,M,EW EMER.,71,17.0,87.0,5.560918,...,107.0,2.906367,5.119038,38.183766,,568.0,36.555556,10.830303,88.0,0.274986
1,21790335,2140-03-11 13:44:00,33.000000,0,F,URGENT,34,13.0,78.0,0.447214,...,82.0,2.167948,2.489980,,,578.0,36.888889,1.923538,92.0,
2,24357541,2177-02-10 14:00:00,659.416667,0,F,URGENT,70,0.0,127.0,11.631239,...,102.0,2.000000,7.314369,272.236111,59.027559,0.0,37.500000,7.120393,94.0,
3,22188993,2145-11-04 18:40:00,143.666667,0,M,EW EMER.,68,27.0,95.0,0.726483,...,72.0,0.752773,4.230839,38.370996,74.007874,594.0,37.388889,3.669696,98.0,
4,21880799,2134-05-22 17:58:00,54.966667,0,M,EW EMER.,52,24.0,100.0,9.227289,...,65.0,0.000000,5.431390,9.899495,66.070866,500.0,36.555556,,100.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16399,29960248,2146-12-09 14:23:00,73.466667,1,F,DIRECT EMER.,65,13.0,94.0,1.863525,...,64.5,1.354006,2.321398,,61.011811,519.0,37.055556,3.761299,96.0,
16400,29962016,2135-10-30 11:45:00,218.083333,1,F,EW EMER.,53,17.0,80.0,1.772811,...,71.0,0.000000,2.483277,,64.086614,308.0,36.888889,12.378938,100.0,
16401,29974575,2131-03-03 19:54:00,98.900000,1,M,EW EMER.,72,21.0,71.0,8.952254,...,73.0,1.549193,14.052283,,70.039370,447.0,36.888889,296.433466,100.0,
16402,29987115,2148-02-19 10:00:00,44.000000,1,F,URGENT,43,12.0,88.0,4.527693,...,122.0,0.547723,4.472136,94.752309,59.921260,594.0,36.722222,7.949843,99.0,


In [9]:
# Count missing values per column to see which features are incomplete.
df6.isnull().sum()

hadm_id                 0
endtime                 0
time_on_vent            0
re_intub_class          0
gender                  0
admission_type          0
anchor_age              0
spontrr                 1
heartrate               1
std_spontrr           111
weight               2949
bloodpressure           1
std_pulseox           103
std_heartrate          79
std_tidalvolume      9182
height               4108
tidalvolume            31
temp                 2081
std_bloodpressure     117
pulseox                 1
std_temp             9582
dtype: int64

In [10]:
# Drop every row that is missing a value in any of these feature columns.
# A single dropna over the whole list removes exactly the same rows as calling
# dropna once per column, and re-assigning the result avoids mutating df6 in
# place.
required_features = [
    'std_bloodpressure', 'std_spontrr', 'temp', 'std_pulseox',
    'std_heartrate', 'weight', 'height', 'tidalvolume',
]
df6 = df6.dropna(subset=required_features)

In [11]:
# Re-check missing values per column after dropping the incomplete rows.
df6.isnull().sum()

hadm_id                 0
endtime                 0
time_on_vent            0
re_intub_class          0
gender                  0
admission_type          0
anchor_age              0
spontrr                 0
heartrate               0
std_spontrr             0
weight                  0
bloodpressure           0
std_pulseox             0
std_heartrate           0
std_tidalvolume      5187
height                  0
tidalvolume             0
temp                    0
std_bloodpressure       0
pulseox                 0
std_temp             4804
dtype: int64

In [12]:
# std_temp and std_tidalvolume still have thousands of missing values in the
# null counts above, so these two columns are removed entirely
# (presumably to avoid losing those rows — TODO confirm).
df6.drop(columns=['std_temp', 'std_tidalvolume'], inplace=True)

In [13]:
# Inspect the remaining rows whose re_intub_class is 1.
df6[df6['re_intub_class']==1]

Unnamed: 0,hadm_id,endtime,time_on_vent,re_intub_class,gender,admission_type,anchor_age,spontrr,heartrate,std_spontrr,weight,bloodpressure,std_pulseox,std_heartrate,height,tidalvolume,temp,std_bloodpressure,pulseox
15197,20024229,2132-08-28 17:30:00,25.500000,1,M,OBSERVATION ADMIT,61,17.0,81.0,3.204164,196.7,80.0,2.280351,5.750362,70.039370,390.0,36.888889,6.260990,95.0
15198,20025172,2140-01-04 11:09:00,93.116667,1,F,OBSERVATION ADMIT,58,32.0,121.0,4.855042,106.0,106.0,1.861899,10.962056,55.952756,384.0,37.166667,12.722683,90.0
15199,20034762,2164-06-08 14:00:00,29.833333,1,F,URGENT,85,20.0,81.0,1.516575,166.1,59.0,1.095445,6.580274,61.011811,432.0,36.222222,5.167204,99.0
15200,20035700,2123-02-05 14:52:00,100.500000,1,M,OBSERVATION ADMIT,42,15.0,101.0,3.082207,163.9,93.0,0.816497,11.545562,68.055118,538.0,37.666667,10.788883,98.0
15203,20050336,2170-07-15 16:00:00,157.000000,1,F,EW EMER.,76,27.0,110.0,3.829708,118.8,94.0,3.286335,4.324350,66.070866,292.0,37.333333,10.114346,90.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16397,29947543,2144-04-02 15:17:00,134.333333,1,F,EW EMER.,68,15.0,108.0,10.264556,77.4,88.0,2.699206,8.264381,59.921260,320.0,37.611111,13.441230,96.0
16398,29951431,2173-10-10 09:32:00,14.783333,1,M,EW EMER.,65,22.0,75.0,2.326094,189.2,84.0,0.000000,3.352327,70.039370,498.0,37.722222,5.902481,100.0
16399,29960248,2146-12-09 14:23:00,73.466667,1,F,DIRECT EMER.,65,13.0,94.0,1.863525,139.0,64.5,1.354006,2.321398,61.011811,519.0,37.055556,3.761299,96.0
16400,29962016,2135-10-30 11:45:00,218.083333,1,F,EW EMER.,53,17.0,80.0,1.772811,160.6,71.0,0.000000,2.483277,64.086614,308.0,36.888889,12.378938,100.0


In [14]:
# Count rows per admission type to spot categories with very few rows.
df6['admission_type'].value_counts()

EW EMER.                       4423
URGENT                         2123
OBSERVATION ADMIT               890
SURGICAL SAME DAY ADMISSION     827
DIRECT EMER.                    359
ELECTIVE                        294
EU OBSERVATION                    1
DIRECT OBSERVATION                1
Name: admission_type, dtype: int64

In [15]:
# Remove the rows belonging to the two admission types that appear only once
# in the value counts above.
rare_admission_types = ['DIRECT OBSERVATION', 'EU OBSERVATION']
rare_rows = df6.index[df6['admission_type'].isin(rare_admission_types)]
df6.drop(index=rare_rows, inplace=True)

In [24]:
# Renumber the rows 0..n-1 after the row removals above. drop=True discards the
# old index directly instead of first materialising it as an 'index' column
# that then has to be dropped in a separate step; this form is also safe to
# re-run (the two-step version raised KeyError on a second run of the drop).
df6 = df6.reset_index(drop=True)

In [27]:
# Final look at the cleaned table before it is written to disk.
df6

Unnamed: 0,hadm_id,endtime,time_on_vent,re_intub_class,gender,admission_type,anchor_age,spontrr,heartrate,std_spontrr,weight,bloodpressure,std_pulseox,std_heartrate,height,tidalvolume,temp,std_bloodpressure,pulseox
0,24357541,2177-02-10 14:00:00,659.416667,0,F,URGENT,70,0.0,127.0,11.631239,167.4,102.0,2.000000,7.314369,59.027559,0.0,37.500000,7.120393,94.0
1,28691076,2149-08-01 15:31:00,25.983333,0,F,EW EMER.,72,19.0,79.0,3.718759,123.2,62.0,0.000000,1.971222,59.921260,387.0,36.444444,20.347548,100.0
2,23452785,2163-04-17 05:00:00,17.000000,0,M,EW EMER.,23,33.0,122.0,3.577709,264.0,101.0,1.788854,13.771952,70.933071,527.0,39.166667,6.350853,96.0
3,27434217,2130-05-20 16:14:00,580.233333,0,F,EW EMER.,81,24.0,63.0,1.788854,198.0,79.0,1.673320,4.324350,61.905512,437.0,36.833333,6.418723,96.0
4,26451703,2187-11-01 15:08:00,170.000000,0,F,URGENT,65,15.0,80.0,2.316607,205.9,105.0,0.577350,3.633180,61.905512,614.0,37.333333,4.505552,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8911,29947543,2144-04-02 15:17:00,134.333333,1,F,EW EMER.,68,15.0,108.0,10.264556,77.4,88.0,2.699206,8.264381,59.921260,320.0,37.611111,13.441230,96.0
8912,29951431,2173-10-10 09:32:00,14.783333,1,M,EW EMER.,65,22.0,75.0,2.326094,189.2,84.0,0.000000,3.352327,70.039370,498.0,37.722222,5.902481,100.0
8913,29960248,2146-12-09 14:23:00,73.466667,1,F,DIRECT EMER.,65,13.0,94.0,1.863525,139.0,64.5,1.354006,2.321398,61.011811,519.0,37.055556,3.761299,96.0
8914,29962016,2135-10-30 11:45:00,218.083333,1,F,EW EMER.,53,17.0,80.0,1.772811,160.6,71.0,0.000000,2.483277,64.086614,308.0,36.888889,12.378938,100.0


In [28]:
# Write the cleaned feature table to disk in Feather format.
# Feather does not store a custom index, which is why the index is reset to the
# default 0..n-1 range before this point.
df6.to_feather('../data/processed/clean_ft_table')

In [18]:
#adm_type=pd.get_dummies(df6['admission_type'],drop_first=True,prefix='adm_type')

don't include ethnicity in model (yet), also don't include admission location in model

In [19]:
#gender =pd.get_dummies(df6['gender'],drop_first=True,prefix = 'gender')

In [20]:
#df_for_model = pd.concat([df6,adm_type,gender],axis=1)

In [21]:
#df_for_model.drop(['hadm_id','admission_type','admission_location','ethnicity','gender'],axis=1,inplace=True)

In [22]:
#df_for_model

In [23]:
#df_for_model.to_csv('clean_ft_table.csv',index=False)