## Script to get clean feature table vefore passing to model

Cleaned feature table saved as 'clean_ft_table.csv'

#### import libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

#### get feature library

In [3]:
df = pd.read_feather('../data/processed/gender')

In [4]:
df_master = df.copy()

In [5]:
# select only useful columns for master dataframe
df_master=df_master[['hadm_id', 'endtime', 'time_on_vent', 're_intub_class',
                     'gender','admission_type','anchor_age']]

#### load each feature and merge with df_master on hadm_id

In [6]:
# all feature names are the same as the feature column so we can find the name of all features in
# the 'selected_features' directory, load each feature in turn, select only the relevant columns and merge these 
# with df_master on 'hadm_id'
dirList= os.listdir('../data/processed/selected_features')
filename = []

for fname in dirList:
    feature = np.append(filename, fname)
    feature_table = pd.read_feather('../data/processed/selected_features/'+fname)
    feature_columns = feature_table[['hadm_id',fname]]
    df_master = pd.merge(left = df_master, right = feature_columns, 
                     how='left', left_on = 'hadm_id',right_on = 'hadm_id')

In [7]:
df_master

Unnamed: 0,hadm_id,endtime,time_on_vent,re_intub_class,gender,admission_type,anchor_age,spontrr,heartrate,std_spontrr,...,height,lactic_acid,bnp,hemoglobin,wbg,tidalvolume,temp,std_bloodpressure,pulseox,std_temp
0,28038802,2185-12-20 09:59:00,108.800000,0,M,EW EMER.,71,17.0,87.0,5.560918,...,,0.7,,12.6,6.6,568.0,36.555556,10.830303,88.0,0.274986
1,21790335,2140-03-11 13:44:00,33.000000,0,F,URGENT,34,13.0,78.0,0.447214,...,,2.9,,9.1,10.5,578.0,36.888889,1.923538,92.0,
2,24357541,2177-02-10 14:00:00,659.416667,0,F,URGENT,70,0.0,127.0,11.631239,...,59.027559,0.8,48581.0,8.5,8.4,0.0,37.500000,7.120393,94.0,
3,22188993,2145-11-04 18:40:00,143.666667,0,M,EW EMER.,68,27.0,95.0,0.726483,...,74.007874,1.8,,11.0,5.4,594.0,37.388889,3.669696,98.0,
4,21880799,2134-05-22 17:58:00,54.966667,0,M,EW EMER.,52,24.0,100.0,9.227289,...,66.070866,1.0,,7.8,8.4,500.0,36.555556,,100.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16399,29960248,2146-12-09 14:23:00,73.466667,1,F,DIRECT EMER.,65,13.0,94.0,1.863525,...,61.011811,1.0,,10.3,18.6,519.0,37.055556,3.761299,96.0,
16400,29962016,2135-10-30 11:45:00,218.083333,1,F,EW EMER.,53,17.0,80.0,1.772811,...,64.086614,1.4,,9.0,10.7,308.0,36.888889,12.378938,100.0,
16401,29974575,2131-03-03 19:54:00,98.900000,1,M,EW EMER.,72,21.0,71.0,8.952254,...,70.039370,0.8,,10.0,6.0,447.0,36.888889,296.433466,100.0,
16402,29987115,2148-02-19 10:00:00,44.000000,1,F,URGENT,43,12.0,88.0,4.527693,...,59.921260,1.1,,9.0,6.5,594.0,36.722222,7.949843,99.0,


#### select only patients that were intubated for longer than 6 hours 

To get rid of all the following warnings, sHould change this to the df.where df.copy together? https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#the-where-method-and-masking

In [17]:
df6 = df_master.copy()
#df6 = df_master[(df_master['time_on_vent']>6)]
df6 = df_master[['hadm_id', 'endtime', 'time_on_vent', 're_intub_class', 'gender',
       'admission_type', 'anchor_age', 'heartrate',
       'weight',  'hco3', 
        
       'creatinine', 'bun',  'height',
       'tidalvolume', 'temp',
       ]]

In [18]:
df6.columns

Index(['hadm_id', 'endtime', 'time_on_vent', 're_intub_class', 'gender',
       'admission_type', 'anchor_age', 'heartrate', 'weight', 'hco3',
       'creatinine', 'bun', 'height', 'tidalvolume', 'temp'],
      dtype='object')

In [27]:
df6.isnull().sum()

hadm_id           0
endtime           0
time_on_vent      0
re_intub_class    0
gender            0
admission_type    0
anchor_age        0
heartrate         0
weight            0
hco3              0
creatinine        0
bun               2
height            0
tidalvolume       0
temp              0
dtype: int64

In [28]:
df6.dropna(subset = ['height'],inplace=True)
df6.dropna(subset = ['weight'],inplace=True)
df6.dropna(subset = ['temp'],inplace=True)
df6.dropna(subset = ['hco3'],inplace=True)
df6.dropna(subset = ['creatinine'],inplace=True)
df6.dropna(subset = ['bun'],inplace=True)
df6.dropna(subset = ['tidalvolume'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df6.dropna(subset = ['height'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df6.dropna(subset = ['weight'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df6.dropna(subset = ['temp'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df6.dropna(subset = ['hco3'],inplac

In [30]:
df6[df6['re_intub_class']==0]

Unnamed: 0,hadm_id,endtime,time_on_vent,re_intub_class,gender,admission_type,anchor_age,heartrate,weight,hco3,creatinine,bun,height,tidalvolume,temp
2,24357541,2177-02-10 14:00:00,659.416667,0,F,URGENT,70,127.0,167.4,28.0,3.0,57.0,59.027559,0.0,37.500000
6,28691076,2149-08-01 15:31:00,25.983333,0,F,EW EMER.,72,79.0,123.2,22.0,2.2,14.0,59.921260,387.0,36.444444
7,23452785,2163-04-17 05:00:00,17.000000,0,M,EW EMER.,23,122.0,264.0,23.0,0.7,23.0,70.933071,527.0,39.166667
8,27434217,2130-05-20 16:14:00,580.233333,0,F,EW EMER.,81,63.0,198.0,26.0,0.8,29.0,61.905512,437.0,36.833333
10,26451703,2187-11-01 15:08:00,170.000000,0,F,URGENT,65,80.0,205.9,35.0,1.0,15.0,61.905512,614.0,37.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15190,21257799,2171-09-04 10:51:00,21.216667,0,F,ELECTIVE,57,83.0,255.2,23.0,0.8,15.0,68.501969,401.0,37.333333
15191,24932959,2189-10-28 12:37:00,8.550000,0,M,EW EMER.,61,50.0,197.1,21.0,1.1,12.0,70.039370,482.0,36.222222
15194,29258317,2126-07-25 17:00:00,17.983333,0,M,EW EMER.,87,103.0,183.7,21.0,1.9,42.0,66.964567,92.0,37.333333
15195,27815009,2113-10-26 10:11:00,22.600000,0,M,EW EMER.,64,74.0,188.8,24.0,0.5,4.0,66.964567,472.0,37.111111


In [None]:
df6.describe()

#### tidal volume should be below 2000, weight shoule be above 80

In [31]:
df6.drop(df6[(df6['tidalvolume'] > 2000)|(df6['tidalvolume'] < 100)].index, inplace = True) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [32]:
df6.drop(df6[df6['weight'] < 50].index, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [33]:
df6['admission_type'].value_counts()

EW EMER.                       4278
URGENT                         2093
OBSERVATION ADMIT               882
SURGICAL SAME DAY ADMISSION     812
DIRECT EMER.                    354
ELECTIVE                        284
EU OBSERVATION                    2
DIRECT OBSERVATION                1
Name: admission_type, dtype: int64

In [34]:
df6.drop(df6.loc[df6['admission_type']=='DIRECT OBSERVATION'].index, inplace=True)
df6.drop(df6.loc[df6['admission_type']=='EU OBSERVATION'].index, inplace=True)
df6.drop(df6.loc[df6['admission_type']=='SURGICAL SAME DAY ADMISSION'].index, inplace=True)

In [35]:
df6.reset_index(inplace=True)

In [36]:
df6.drop('index',axis=1,inplace=True)

In [37]:
df6

Unnamed: 0,hadm_id,endtime,time_on_vent,re_intub_class,gender,admission_type,anchor_age,heartrate,weight,hco3,creatinine,bun,height,tidalvolume,temp
0,28691076,2149-08-01 15:31:00,25.983333,0,F,EW EMER.,72,79.0,123.2,22.0,2.2,14.0,59.921260,387.0,36.444444
1,23452785,2163-04-17 05:00:00,17.000000,0,M,EW EMER.,23,122.0,264.0,23.0,0.7,23.0,70.933071,527.0,39.166667
2,27434217,2130-05-20 16:14:00,580.233333,0,F,EW EMER.,81,63.0,198.0,26.0,0.8,29.0,61.905512,437.0,36.833333
3,26451703,2187-11-01 15:08:00,170.000000,0,F,URGENT,65,80.0,205.9,35.0,1.0,15.0,61.905512,614.0,37.333333
4,23481036,2112-05-25 09:52:00,83.866667,0,F,EW EMER.,61,73.0,199.5,26.0,0.9,23.0,66.070866,556.0,36.444444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7886,29947543,2144-04-02 15:17:00,134.333333,1,F,EW EMER.,68,108.0,77.4,26.0,0.5,13.0,59.921260,320.0,37.611111
7887,29951431,2173-10-10 09:32:00,14.783333,1,M,EW EMER.,65,75.0,189.2,24.0,0.9,18.0,70.039370,498.0,37.722222
7888,29960248,2146-12-09 14:23:00,73.466667,1,F,DIRECT EMER.,65,94.0,139.0,21.0,2.9,62.0,61.011811,519.0,37.055556
7889,29962016,2135-10-30 11:45:00,218.083333,1,F,EW EMER.,53,80.0,160.6,34.0,0.5,13.0,64.086614,308.0,36.888889


In [38]:
df6.to_feather('../data/processed/clean_ft_table_labs_strip')

In [None]:
#adm_type=pd.get_dummies(df6['admission_type'],drop_first=True,prefix='adm_type')

don't include ethnicity in model (yet), also don't include admission location in model

In [None]:
#gender =pd.get_dummies(df6['gender'],drop_first=True,prefix = 'gender')

In [None]:
#df_for_model = pd.concat([df6,adm_type,gender],axis=1)

In [None]:
#df_for_model.drop(['hadm_id','admission_type','admission_location','ethnicity','gender'],axis=1,inplace=True)

In [None]:
#df_for_model

In [None]:
#df_for_model.to_csv('clean_ft_table.csv',index=False)