## Script to get clean feature table before passing to model

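Cleaned feature table is saved as the feather file '../data/processed/clean_copd' (the export to 'clean_ft_table.csv' at the end of the notebook is currently commented out).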

#### import libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

#### get feature library

In [2]:
# Load the base table from the 'gender' feather file in the processed-data directory.
gender_path = '../data/processed/gender'
df = pd.read_feather(gender_path)

In [3]:
# Load the 'copd' feather file (diagnosis rows keyed by hadm_id).
copd_path = '../data/feathered/copd'
copd = pd.read_feather(copd_path)

In [4]:
# Display the copd table (columns: hadm_id, seq_num, icd_code).
copd

Unnamed: 0,hadm_id,seq_num,icd_code
0,29500472,8,496
1,29912012,18,496
2,22856743,19,496
3,29794854,12,496
4,27323423,13,496
...,...,...,...
1812,21353701,6,J440
1813,23378172,4,J440
1814,23378172,9,J441
1815,20397401,16,J449


In [5]:
# Work on an independent (deep) copy so the loaded table `df` stays untouched.
df_master = df.copy(deep=True)

In [6]:
# Keep only the identifier, outcome and demographic columns needed in the master dataframe.
master_columns = [
    'hadm_id',
    'endtime',
    'time_on_vent',
    're_intub_class',
    'gender',
    'admission_type',
    'anchor_age',
]
df_master = df_master[master_columns]

#### load each feature and merge with df_master on hadm_id

In [7]:
# Each file in 'selected_features' is named after the single feature column it stores,
# so the directory listing doubles as the list of feature names. Every file is loaded in
# turn, trimmed to ['hadm_id', <feature>] and left-merged onto df_master by 'hadm_id'.
feature_dir = '../data/processed/selected_features/'
dirList = os.listdir(feature_dir)
filename = []  # names of the features merged into df_master, in merge order

for fname in dirList:
    # Skip hidden entries (e.g. '.DS_Store', '.ipynb_checkpoints'): they are not feature
    # files and would make read_feather fail.
    if fname.startswith('.'):
        continue
    # Record the feature name (the previous np.append call discarded its result,
    # so this list always stayed empty).
    filename.append(fname)
    feature_table = pd.read_feather(feature_dir + fname)
    # Keep only the join key and the feature's own column.
    feature_columns = feature_table[['hadm_id', fname]]
    # Left join: every row of df_master is kept; admissions without this feature get NaN.
    df_master = pd.merge(left=df_master, right=feature_columns,
                         how='left', on='hadm_id')

In [8]:
# Display the master dataframe after all feature columns have been merged in.
df_master

Unnamed: 0,hadm_id,endtime,time_on_vent,re_intub_class,gender,admission_type,anchor_age,spontrr,heartrate,std_spontrr,...,height,lactic_acid,bnp,hemoglobin,wbg,tidalvolume,temp,std_bloodpressure,pulseox,std_temp
0,28038802,2185-12-20 09:59:00,108.800000,0,M,EW EMER.,71,17.0,87.0,5.560918,...,,0.7,,12.6,6.6,568.0,36.555556,10.830303,88.0,0.274986
1,21790335,2140-03-11 13:44:00,33.000000,0,F,URGENT,34,13.0,78.0,0.447214,...,,2.9,,9.1,10.5,578.0,36.888889,1.923538,92.0,
2,24357541,2177-02-10 14:00:00,659.416667,0,F,URGENT,70,0.0,127.0,11.631239,...,59.027559,0.8,48581.0,8.5,8.4,0.0,37.500000,7.120393,94.0,
3,22188993,2145-11-04 18:40:00,143.666667,0,M,EW EMER.,68,27.0,95.0,0.726483,...,74.007874,1.8,,11.0,5.4,594.0,37.388889,3.669696,98.0,
4,21880799,2134-05-22 17:58:00,54.966667,0,M,EW EMER.,52,24.0,100.0,9.227289,...,66.070866,1.0,,7.8,8.4,500.0,36.555556,,100.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16399,29960248,2146-12-09 14:23:00,73.466667,1,F,DIRECT EMER.,65,13.0,94.0,1.863525,...,61.011811,1.0,,10.3,18.6,519.0,37.055556,3.761299,96.0,
16400,29962016,2135-10-30 11:45:00,218.083333,1,F,EW EMER.,53,17.0,80.0,1.772811,...,64.086614,1.4,,9.0,10.7,308.0,36.888889,12.378938,100.0,
16401,29974575,2131-03-03 19:54:00,98.900000,1,M,EW EMER.,72,21.0,71.0,8.952254,...,70.039370,0.8,,10.0,6.0,447.0,36.888889,296.433466,100.0,
16402,29987115,2148-02-19 10:00:00,44.000000,1,F,URGENT,43,12.0,88.0,4.527693,...,59.921260,1.1,,9.0,6.5,594.0,36.722222,7.949843,99.0,


In [9]:
# Attach the master feature columns to every row of copd (left join on hadm_id).
# NOTE(review): copd can hold several rows for one hadm_id (e.g. 23378172 appears with
# both J440 and J441), so such admissions show up more than once in copd_df; confirm
# this duplication is intended before modelling.
copd_df = copd.merge(df_master, how='left', on='hadm_id')

In [10]:
# Display the copd rows joined with the master feature columns.
copd_df

Unnamed: 0,hadm_id,seq_num,icd_code,endtime,time_on_vent,re_intub_class,gender,admission_type,anchor_age,spontrr,...,height,lactic_acid,bnp,hemoglobin,wbg,tidalvolume,temp,std_bloodpressure,pulseox,std_temp
0,29500472,8,496,2174-06-16 13:44:00,168.400000,0,M,EW EMER.,60,22.0,...,72.917323,1.6,1194.0,8.9,5.9,461.0,36.444444,18.906789,98.0,0.824958
1,29912012,18,496,2125-04-23 06:58:00,77.400000,0,F,EW EMER.,77,20.0,...,66.964567,1.5,,9.7,11.2,465.0,37.111111,12.537942,97.0,
2,22856743,19,496,2133-03-06 13:25:00,49.250000,1,M,URGENT,73,27.0,...,70.039370,2.0,1870.0,10.8,0.6,533.0,35.666667,13.377308,99.0,0.117851
3,29794854,12,496,2156-03-01 14:54:00,23.350000,1,M,URGENT,79,21.0,...,,1.2,,11.7,10.9,322.0,35.833333,7.395945,95.0,0.039284
4,27323423,13,496,2173-05-03 15:20:00,13.333333,0,F,EW EMER.,38,16.0,...,64.980315,0.7,,10.8,20.3,474.0,36.666667,14.729891,95.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1812,21353701,6,J440,2149-10-21 16:00:00,257.016667,0,M,EW EMER.,73,12.0,...,72.023622,1.6,,14.7,25.9,464.0,37.333333,,96.0,
1813,23378172,4,J440,2157-11-07 10:06:00,43.600000,0,M,OBSERVATION ADMIT,69,8.0,...,68.055118,0.6,,8.3,10.3,421.0,36.944444,,99.0,
1814,23378172,9,J441,2157-11-07 10:06:00,43.600000,0,M,OBSERVATION ADMIT,69,8.0,...,68.055118,0.6,,8.3,10.3,421.0,36.944444,,99.0,
1815,20397401,16,J449,2138-07-23 16:56:00,64.933333,0,F,EW EMER.,89,34.0,...,59.921260,,,7.5,6.8,1.7,37.111111,4.894117,100.0,0.157135


#### select only patients that were intubated for longer than 6 hours 

To get rid of all the following warnings, should this be changed to use df.where together with df.copy? https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#the-where-method-and-masking


#df6 = df_master[(df_master['time_on_vent']>6)]
df6 = df_master[['hadm_id', 'endtime', 'time_on_vent', 're_intub_class', 'gender',
       'admission_type', 'anchor_age', 'heartrate',
       'weight',  'hco3', 
        'std_pulseox', 'std_heartrate',
       'creatinine', 'std_tidalvolume', 'bun',  'height',
       'tidalvolume', 'temp',
       'std_bloodpressure']]

In [11]:
# Start the cleaning steps from an independent (deep) copy of copd_df, so the
# in-place operations on df6 below do not modify copd_df.
df6 = copd_df.copy(deep=True)

In [12]:
# List the column names currently present in df6.
df6.columns

Index(['hadm_id', 'seq_num', 'icd_code', 'endtime', 'time_on_vent',
       're_intub_class', 'gender', 'admission_type', 'anchor_age', 'spontrr',
       'heartrate', 'std_spontrr', 'weight', 'sodium', 'abg_po2', 'abg_ph',
       'hco3', 'abg_pco2', 'bloodpressure', 'std_pulseox', 'std_heartrate',
       'vbg_pco2', 'creatinine', 'std_tidalvolume', 'bun', 'vbg_ph', 'height',
       'lactic_acid', 'bnp', 'hemoglobin', 'wbg', 'tidalvolume', 'temp',
       'std_bloodpressure', 'pulseox', 'std_temp'],
      dtype='object')

In [13]:
# Count the missing values in each column before any cleaning.
df6.isna().sum()

hadm_id                 0
seq_num                 0
icd_code                0
endtime                 0
time_on_vent            0
re_intub_class          0
gender                  0
admission_type          0
anchor_age              0
spontrr                 0
heartrate               0
std_spontrr            17
weight                307
sodium                 10
abg_po2               257
abg_ph                257
hco3                   12
abg_pco2              257
bloodpressure           0
std_pulseox            14
std_heartrate          12
vbg_pco2             1144
creatinine             12
std_tidalvolume      1075
bun                    12
vbg_ph               1074
height                380
lactic_acid           226
bnp                  1670
hemoglobin             13
wbg                    12
tidalvolume             2
temp                  206
std_bloodpressure      17
pulseox                 0
std_temp             1037
dtype: int64

In [14]:
# Remove the five columns with the most missing values (each is missing in more than
# 1000 rows according to the null counts above), rather than losing those rows in dropna.
sparse_columns = ['std_temp', 'bnp', 'vbg_ph', 'std_tidalvolume', 'vbg_pco2']
df6.drop(columns=sparse_columns, inplace=True)

In [15]:
# Discard every row that still has at least one missing value in any remaining column.
df6.dropna(axis='index', how='any', inplace=True)

In [16]:
# Re-count missing values per column to confirm none remain.
df6.isna().sum()

hadm_id              0
seq_num              0
icd_code             0
endtime              0
time_on_vent         0
re_intub_class       0
gender               0
admission_type       0
anchor_age           0
spontrr              0
heartrate            0
std_spontrr          0
weight               0
sodium               0
abg_po2              0
abg_ph               0
hco3                 0
abg_pco2             0
bloodpressure        0
std_pulseox          0
std_heartrate        0
creatinine           0
bun                  0
height               0
lactic_acid          0
hemoglobin           0
wbg                  0
tidalvolume          0
temp                 0
std_bloodpressure    0
pulseox              0
dtype: int64

In [17]:
# Show the rows whose re_intub_class equals 1.
is_class_one = df6['re_intub_class'].eq(1)
df6.loc[is_class_one]

Unnamed: 0,hadm_id,seq_num,icd_code,endtime,time_on_vent,re_intub_class,gender,admission_type,anchor_age,spontrr,...,creatinine,bun,height,lactic_acid,hemoglobin,wbg,tidalvolume,temp,std_bloodpressure,pulseox
2,22856743,19,496,2133-03-06 13:25:00,49.250000,1,M,URGENT,73,27.0,...,0.8,37.0,70.039370,2.0,10.8,0.6,533.0,35.666667,13.377308,99.0
26,22813114,18,496,2122-10-07 12:19:00,605.783333,1,M,URGENT,68,23.0,...,3.0,40.0,66.964567,1.5,6.7,13.5,469.0,38.055556,2.857738,96.0
33,25491134,13,496,2159-12-18 14:53:00,680.816667,1,F,URGENT,85,30.0,...,0.3,24.0,61.011811,0.5,9.4,8.0,203.0,36.333333,2.645751,96.0
96,28650948,6,496,2157-06-03 12:00:00,32.500000,1,M,EW EMER.,62,10.0,...,0.5,15.0,72.023622,1.0,13.8,32.9,562.0,36.333333,7.368853,98.0
99,26729806,15,496,2186-01-03 11:00:00,31.850000,1,M,EW EMER.,55,10.0,...,0.6,29.0,72.023622,0.7,7.9,14.5,890.0,36.888889,4.183300,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1723,24782337,13,J440,2187-05-14 11:31:00,65.066667,1,M,EW EMER.,76,11.0,...,1.3,33.0,68.055118,1.8,12.6,19.4,690.0,37.222222,12.044362,90.0
1753,26383749,17,J449,2161-01-19 10:18:00,282.566667,1,F,DIRECT EMER.,80,18.0,...,0.7,47.0,64.086614,1.0,9.1,14.2,278.0,37.333333,6.177918,95.0
1759,25320808,9,J441,2163-12-20 15:45:00,23.750000,1,M,EW EMER.,51,15.0,...,0.7,9.0,68.055118,0.7,11.6,13.9,521.0,37.000000,80.579567,99.0
1761,25320808,6,J440,2163-12-20 15:45:00,23.750000,1,M,EW EMER.,51,15.0,...,0.7,9.0,68.055118,0.7,11.6,13.9,521.0,37.000000,80.579567,99.0


In [18]:
# Summary statistics of the numeric columns, used to spot implausible values.
# NOTE(review): the output below shows lactic_acid max = 999999.0, temp min = -17.78 and
# heartrate min = 0.0; these look like placeholder/sentinel values that the later
# filters (tidalvolume, weight) do not remove — confirm whether they should be cleaned.
df6.describe()

Unnamed: 0,hadm_id,seq_num,time_on_vent,re_intub_class,anchor_age,spontrr,heartrate,std_spontrr,weight,sodium,...,creatinine,bun,height,lactic_acid,hemoglobin,wbg,tidalvolume,temp,std_bloodpressure,pulseox
count,868.0,868.0,868.0,868.0,868.0,868.0,868.0,868.0,868.0,868.0,...,868.0,868.0,868.0,868.0,868.0,868.0,868.0,868.0,868.0,868.0
mean,25105130.0,12.582949,105.944931,0.104839,67.97235,19.35023,84.235023,4.479268,185.110599,139.81106,...,1.3947,31.175115,66.222075,1153.441129,9.776728,12.026728,454.155968,36.961367,9.443751,97.042627
std,2818001.0,6.832738,144.68667,0.306522,10.912113,5.721844,16.876102,2.596809,57.471766,4.75841,...,1.19526,22.444585,4.125442,33942.131223,1.82738,5.412681,164.642146,1.935166,13.180302,2.654982
min,20006410.0,1.0,6.016667,0.0,36.0,0.0,0.0,0.0,2.2,125.0,...,0.2,3.0,53.074803,0.5,6.5,0.3,0.0,-17.777778,0.547723,86.0
25%,22720790.0,7.0,22.025,0.0,60.0,15.0,73.0,2.625422,146.2,136.0,...,0.7,16.0,62.996063,1.0,8.4,8.0,355.0,36.722222,5.085928,95.0
50%,25119690.0,12.0,56.508333,0.0,68.0,19.0,83.0,3.876568,177.4,140.0,...,1.0,24.0,66.070866,1.2,9.4,11.3,442.5,37.0,7.299533,97.0
75%,27487270.0,17.0,127.395833,0.0,76.0,23.0,94.0,5.851968,217.8,143.0,...,1.6,40.0,69.704232,1.625,10.9,14.525,542.0,37.277778,10.667313,99.0
max,29989760.0,38.0,1252.066667,1.0,91.0,38.0,147.0,22.51243,509.3,156.0,...,9.4,134.0,78.770079,999999.0,16.4,43.7,1278.0,39.611111,296.433466,100.0


#### tidal volume should be below 2000 (and at least 100), weight should be above 80 (note: the weight filter below currently uses 50)

In [19]:
# Drop rows whose tidalvolume lies outside the accepted range (above 2000 or below 100).
tidal_out_of_range = df6['tidalvolume'].gt(2000) | df6['tidalvolume'].lt(100)
df6.drop(index=df6.index[tidal_out_of_range], inplace=True)

In [20]:
# Drop rows whose weight is below 50.
weight_too_low = df6['weight'].lt(50)
df6.drop(index=df6.index[weight_too_low], inplace=True)

In [21]:
# Count how many remaining rows fall into each admission_type category.
df6['admission_type'].value_counts()

EW EMER.                       337
URGENT                         254
OBSERVATION ADMIT              113
SURGICAL SAME DAY ADMISSION     86
DIRECT EMER.                    29
ELECTIVE                        24
Name: admission_type, dtype: int64

In [22]:
# Remove rows belonging to the admission types that are excluded from the dataset.
# NOTE(review): the value counts above list 'OBSERVATION ADMIT' but neither
# 'DIRECT OBSERVATION' nor 'EU OBSERVATION', so in this run only the
# 'SURGICAL SAME DAY ADMISSION' rows are removed — confirm the list is as intended.
excluded_admission_types = [
    'DIRECT OBSERVATION',
    'EU OBSERVATION',
    'SURGICAL SAME DAY ADMISSION',
]
is_excluded = df6['admission_type'].isin(excluded_admission_types)
df6.drop(index=df6.index[is_excluded], inplace=True)

In [23]:
# Renumber the rows 0..n-1 after the drops; the old index labels are kept as a new
# 'index' column (removed in the following cell).
df6.reset_index(drop=False, inplace=True)

In [24]:
# Discard the 'index' column holding the old row labels left behind by reset_index.
df6.drop(columns=['index'], inplace=True)

In [25]:
# Display the cleaned table before it is written to disk.
df6

Unnamed: 0,hadm_id,seq_num,icd_code,endtime,time_on_vent,re_intub_class,gender,admission_type,anchor_age,spontrr,...,creatinine,bun,height,lactic_acid,hemoglobin,wbg,tidalvolume,temp,std_bloodpressure,pulseox
0,29500472,8,496,2174-06-16 13:44:00,168.400000,0,M,EW EMER.,60,22.0,...,0.7,30.0,72.917323,1.6,8.9,5.9,461.0,36.444444,18.906789,98.0
1,29912012,18,496,2125-04-23 06:58:00,77.400000,0,F,EW EMER.,77,20.0,...,1.6,34.0,66.964567,1.5,9.7,11.2,465.0,37.111111,12.537942,97.0
2,22856743,19,496,2133-03-06 13:25:00,49.250000,1,M,URGENT,73,27.0,...,0.8,37.0,70.039370,2.0,10.8,0.6,533.0,35.666667,13.377308,99.0
3,27323423,13,496,2173-05-03 15:20:00,13.333333,0,F,EW EMER.,38,16.0,...,0.6,6.0,64.980315,0.7,10.8,20.3,474.0,36.666667,14.729891,95.0
4,20109446,19,496,2153-12-05 13:10:00,97.300000,0,F,URGENT,60,33.0,...,1.0,24.0,61.905512,1.5,10.5,17.3,387.0,38.222222,3.386247,97.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
752,29341821,9,J441,2185-01-09 13:25:00,84.916667,0,M,EW EMER.,66,18.0,...,1.7,75.0,66.964567,1.7,9.4,8.5,407.0,36.888889,5.913262,100.0
753,22989281,13,J449,2162-02-26 13:20:00,26.000000,0,M,EW EMER.,62,16.0,...,1.5,60.0,74.007874,1.7,7.7,26.4,552.0,36.944444,2.422120,96.0
754,23689276,13,J440,2163-09-16 13:21:00,125.016667,0,F,URGENT,77,15.0,...,1.4,54.0,59.921260,1.1,8.1,15.2,357.0,38.888889,6.534524,99.0
755,27173448,12,J449,2157-10-31 10:18:00,281.300000,0,M,URGENT,69,17.0,...,0.6,32.0,66.070866,0.5,7.6,9.9,764.0,37.000000,4.956477,100.0


In [26]:
# Persist the cleaned table as a feather file in the processed-data directory.
clean_copd_path = '../data/processed/clean_copd'
df6.to_feather(clean_copd_path)

In [27]:
#adm_type=pd.get_dummies(df6['admission_type'],drop_first=True,prefix='adm_type')

don't include ethnicity in model (yet), also don't include admission location in model

In [28]:
#gender =pd.get_dummies(df6['gender'],drop_first=True,prefix = 'gender')

In [29]:
#df_for_model = pd.concat([df6,adm_type,gender],axis=1)

In [30]:
#df_for_model.drop(['hadm_id','admission_type','admission_location','ethnicity','gender'],axis=1,inplace=True)

In [31]:
#df_for_model

In [32]:
#df_for_model.to_csv('clean_ft_table.csv',index=False)