In [104]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
import math

In [105]:
# Show every column when a wide DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns',None)

In [106]:
# Lab item dictionary (itemid -> label / fluid / category); used later to look up
# the labels of the itemids selected for modelling.
d_labitems = pd.read_csv('data/mimic-iv-0.4/hosp/d_labitems.csv.gz', compression='gzip')

In [107]:
# Sampled admissions with the `pos` label and demographic columns.
# NOTE(review): unpickling can execute arbitrary code - only load pickles produced by this project.
admissions_sample = pd.read_pickle("data/admissions_sample.pkl")

In [108]:
# Quick visual check of the sampled admissions (pandas truncates the row display).
admissions_sample

Unnamed: 0,subject_id,hadm_id,pos,ethnicity,ed_length,insurance,gender,age
0,19257432,24061325,1,WHITE,8.233333,Medicare,M,70
1,15812823,21055965,1,WHITE,3.033333,Medicare,M,84
2,14868010,21926866,1,WHITE,5.650000,Medicare,F,86
3,14237722,25273073,1,WHITE,4.933333,Medicare,F,66
4,17116333,28885585,1,WHITE,0.000000,Medicare,F,73
...,...,...,...,...,...,...,...,...
14995,18264133,26719786,0,WHITE,0.000000,Medicaid,F,51
14996,14609714,23634251,0,WHITE,21.900000,Medicaid,M,52
14997,19906137,23380658,0,WHITE,6.216667,Other,F,24
14998,15519782,29268751,0,ASIAN,4.983333,Other,M,18


## Pull Chart and Lab data only where it matches our sample patient list

In [119]:
# # We have Decided not to use Chart data after exploring it further
# # However, it still comes up in our discussions, so keeping this for now
# # Will need to update this code based on other updates we've made

# %%time #run this for next time

# chunksize = 3*(10 ** 6)
# counter=0
# chartevents_sample = []
# for chunk in pd.read_csv('data/mimic-iv-0.4/icu/chartevents.csv.gz', compression='gzip', chunksize=chunksize):
#     chartevents_sample.append(chunk[chunk['subject_id'].isin(list(admissions_sample.subject_id))])
#     counter+=1
#     print(f'chunk {counter} processed')

# chartevents_final = pd.concat(chartevents_sample)



In [120]:
# chartevents_final.to_pickle("./data/chartevents_final.pkl")

In [121]:
# chartevents_final = pd.read_pickle("./data/chartevents_final.pkl")

In [122]:
# chartevents_final.shape

Load Lab Events table

In [123]:
# %%time

# chunksize = 3*(10 ** 6)
# counter=0
# lab_events_sample = []
# for chunk in pd.read_csv('data/mimic-iv-0.4/hosp/labevents.csv.gz', compression='gzip', chunksize=chunksize):
#     lab_events_sample.append(chunk[chunk['subject_id'].isin(list(admissions_sample.subject_id))])
#     counter+=1
#     print(f'chunk {counter} processed')

# lab_events_final = pd.concat(lab_events_sample)
# # 5k patients
# # 3min 52s

# # 10k patients
# # 4min 6s

# # 15k patients
# # 4min 27s



In [124]:
# lab_events_final.to_pickle("./data/lab_events_final.pkl")

In [109]:
# Load the cached lab events (the chunked CSV extraction that builds this pickle is commented out above).
# NOTE(review): unpickling can execute arbitrary code - only load pickles produced by this project.
lab_events_final = pd.read_pickle("data/lab_events_final.pkl")

In [110]:
# (rows, columns) of the cached lab events table.
lab_events_final.shape

(6765210, 15)

## Manipulating the Data

Take only records related to our sample patients

Filter lab events based on first hospital visit of our sample population

In [111]:
# Keep only the lab events whose hadm_id belongs to one of the sampled admissions.
# Open question: could this filter be applied while generating the lab data?
# Not for now - we are still discussing using different hospitalizations of the same subjects.

%time lab_events_sampled = lab_events_final[lab_events_final['hadm_id'].isin(admissions_sample.hadm_id)]


CPU times: user 2.9 s, sys: 4.83 s, total: 7.74 s
Wall time: 10.3 s


Check how many patients have had each test done

In [112]:
# Ten tests taken by the largest number of distinct patients.
(
    lab_events_sampled
    .groupby('itemid')['subject_id']
    .nunique()
    .sort_values(ascending=False)
    .head(10)
)

itemid
51221    8433
51265    8297
51222    8236
51301    8235
51248    8229
51279    8229
51277    8229
51250    8229
51249    8229
50912    7985
Name: subject_id, dtype: int64

Most common tests performed on patients with pos diagnosis

In [113]:
# Same ranking as above, restricted to patients that have a pos == 1 admission.
(
    lab_events_sampled[
        lab_events_sampled['subject_id'].isin(
            list(admissions_sample.loc[admissions_sample['pos'] == 1, 'subject_id'])
        )
    ]
    .groupby('itemid')['subject_id']
    .nunique()
    .sort_values(ascending=False)
    .head(10)
)


itemid
50971    1196
50983    1193
50912    1192
50902    1191
51006    1191
51221    1181
51265    1181
50868    1180
50882    1180
50931    1174
Name: subject_id, dtype: int64

Item Black List

In [114]:
# Item ids that are removed before the most common tests are ranked.
item_black_list = [50920]
# TODO: 50920 reports its results as free text in `comments`; decide how to parse it
# before taking it off the black list.


Bonus Items To Add

In [115]:
# These tests are frequently missing, but are known to be good indicators of congestive heart failure,
# so they are always added on top of the most common tests.

item_bonus_list = [51274, 51003, 50911, 51464]
# 51274 PT
# (not included) 50920 Estimated GFR (MDRD equation) - currently black-listed
# 51003 Troponin T
# 50911 Creatine Kinase, MB Isoenzyme
# 51464 Specific Gravity
# NOTE(review): d_labitems labels 51464 as "Bilirubin" (Urine), not Specific Gravity -
# confirm which itemid was intended here.


Take only the X most commonly performed tests

In [116]:
# Number of most common tests to keep (the bonus items are added on top of these).
top_test_num = 20

# Remove black-listed items first so they can never make it into the ranking.
lab_events_valid = lab_events_sampled[~lab_events_sampled['itemid'].isin(item_black_list)]

# On patients with pos diagnosis:
# rank the tests by the number of distinct pos patients who had them and keep the top N.
# The ranking Series is indexed by itemid, so the ids are read straight from the index.
itemid_sub_sample = (
    lab_events_valid[
        lab_events_valid['subject_id'].isin(
            admissions_sample.loc[admissions_sample['pos'] == 1, 'subject_id']
        )
    ]
    .groupby('itemid')['subject_id']
    .nunique()
    .sort_values(ascending=False)
    .head(top_test_num)
    .index
    .to_list()
)
# On all sample patients
# itemid_sub_sample = lab_events_sampled.groupby('itemid')['subject_id'].nunique().sort_values(ascending = False).head(top_test_num).index.to_list()

# Append the bonus items; dict.fromkeys drops an id that is already in the top N
# while keeping the original order.
itemid_sub_sample = list(dict.fromkeys(itemid_sub_sample + item_bonus_list))

# .copy() is required: the cleaning cells below assign into this frame with .loc, and
# doing that on a slice of lab_events_sampled raises SettingWithCopyWarning and is not
# guaranteed to write to the object we keep using.
lab_events_sampled_sub = lab_events_sampled[lab_events_sampled['itemid'].isin(itemid_sub_sample)].copy()


In [117]:
# # lab_events_valid['subject_id'].isin(list(admissions_sample[admissions_sample['pos'] == 1].subject_id))

# top_labs = lab_events_sampled.groupby(['itemid']).agg(freq=('subject_id', lambda x: len(np.unique(x))))\
# .reset_index().sort_values('freq', ascending=False).head(60)

# top_labs


In [118]:
# subset_labs['subject_id'].reset_index(drop=True).plot(kind='line')

In [119]:
# print(lab_events_valid[lab_events_valid['subject_id'].isin(list(admissions_sample[admissions_sample['pos'] == 1].subject_id))].groupby('itemid')['subject_id'].nunique().sort_values(ascending = False).head(20).reset_index().rename(columns = {'index' : 'itemid'}))

# # print(1191 / admissions_sample[admissions_sample['pos'] == 1].subject_id.nunique())
# # print(1192 / admissions_sample[admissions_sample['pos'] == 1].subject_id.nunique())


In [120]:
# print(lab_events_valid[lab_events_valid['subject_id'].isin(list(admissions_sample[admissions_sample['pos'] == 0].subject_id))].groupby('itemid')['subject_id'].nunique().sort_values(ascending = False).head(20).reset_index().rename(columns = {'index' : 'itemid'}))

# # print(6726 / admissions_sample[admissions_sample['pos'] == 0].subject_id.nunique())
# # print(6793 / admissions_sample[admissions_sample['pos'] == 0].subject_id.nunique())


## Data Cleaning

Open question: should we build a separate table of the text-valued results and run some of the forest models on it?

Special updates for Urine sample tests

In [121]:
# Other invalid value imputation
# For now set them all to normal/most common value
#
# Each rule is (itemid, raw `value` entries to replace, numeric replacement).
# The replacement is written to both `value` and `valuenum`; rules run top to bottom.
invalid_value_rules = [
    (51237, [np.nan, 'ERROR', 'UNABLE TO REPORT'], 1.1),
    (51237, ['>21.8'], 22),
    (51237, ['>19.2'], 20),
    (51275, ['>150', '>150.0'], 175),
    # Should we just delete these?
    # For 5k patients there are 6 records here
    (51274, ['ERROR'], 11),
    # Should we just delete these?
    # For 5k patients there are 1 records here
    (51275, ['UNABLE TO REPORT'], 31),
    # Clumsy..
    (51275, ['34..3'], 34.3),
]

for rule_itemid, rule_raw_values, rule_replacement in invalid_value_rules:
    rule_mask = (lab_events_sampled_sub['itemid'] == rule_itemid) & (
        lab_events_sampled_sub['value'].isin(rule_raw_values)
    )
    lab_events_sampled_sub.loc[rule_mask, ['value', 'valuenum']] = rule_replacement


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


More Data Cleaning

In [122]:
%%time

# Can probably make a helper function combining some of the below mapping

# 51466
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51466) & (lab_events_sampled_sub['value'].isin([np.nan])), ['value']] = lab_events_sampled_sub['comments']
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51466) & (lab_events_sampled_sub['value'].isin(['NEG.  ', 'NEG', ' '])), ['value', 'valuenum']] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51466) & (lab_events_sampled_sub['value'].isin(['TR', 'TR.  ', 'TR*.  '])), ['value', 'valuenum']] = 1
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51466) & (lab_events_sampled_sub['value'].isin(['SM', 'SM .  ', 'SM*.  '])), ['value', 'valuenum']] = 2
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51466) & (lab_events_sampled_sub['value'].isin(['MOD', 'MOD.  ', 'MOD*.  '])), ['value', 'valuenum']] = 3
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51466) & (lab_events_sampled_sub['value'].isin(['LG', 'LG.  ', 'LG*.  ', 'LGE', 'LRG'])), ['value', 'valuenum']] = 4

lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51466) & (lab_events_sampled_sub['ref_range_lower'].isnull()), 'ref_range_lower'] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51466) & (lab_events_sampled_sub['ref_range_upper'].isnull()), 'ref_range_upper'] = 0

# 51514
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51514) & (lab_events_sampled_sub['value'].isin([np.nan])), ['value']] = lab_events_sampled_sub['comments']
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51514) & (lab_events_sampled_sub['value'].isin(['NEG.  ', 'NEG', 'NORMAL.  ', ' '])), ['value', 'valuenum']] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51514) & (lab_events_sampled_sub['value'].isin(['>8'])), ['value', 'valuenum']] = 10
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51514) & (lab_events_sampled_sub['value'].isin(['>12.  ', '>12*.  '])), ['value', 'valuenum']] = 15

# 51464
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51464) & (lab_events_sampled_sub['value'].isin([np.nan])), ['value']] = lab_events_sampled_sub['comments']
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51464) & (lab_events_sampled_sub['value'].isin(['NEG.  ', 'NEG', ' '])), ['value', 'valuenum']] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51464) & (lab_events_sampled_sub['value'].isin(['SM', 'SM .  ', 'SM*.  '])), ['value', 'valuenum']] = 1
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51464) & (lab_events_sampled_sub['value'].isin(['MOD', 'MOD.  ', 'MOD*.  '])), ['value', 'valuenum']] = 2
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51464) & (lab_events_sampled_sub['value'].isin(['LG', 'LG.  ', 'LG*.  ', 'LGE'])), ['value', 'valuenum']] = 3

lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51464) & (lab_events_sampled_sub['ref_range_lower'].isnull()), 'ref_range_lower'] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51464) & (lab_events_sampled_sub['ref_range_upper'].isnull()), 'ref_range_upper'] = 0

# 51487
# Should Pos be marked abnormal?
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51487) & (lab_events_sampled_sub['value'].isin([np.nan])), ['value']] = lab_events_sampled_sub['comments']
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51487) & (lab_events_sampled_sub['value'].isin(['NEG.  ', 'NEG', ' '])), ['value', 'valuenum']] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51487) & (lab_events_sampled_sub['value'].isin(['POS.  ', 'POS', 'POS*.  '])), ['value', 'valuenum']] = 1

lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51487) & (lab_events_sampled_sub['ref_range_lower'].isnull()), 'ref_range_lower'] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51487) & (lab_events_sampled_sub['ref_range_upper'].isnull()), 'ref_range_upper'] = 0

# 51492
# Should Pos be marked abnormal?
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51492) & (lab_events_sampled_sub['value'].isin([np.nan])), ['value']] = lab_events_sampled_sub['comments']
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51492) & (lab_events_sampled_sub['value'].isin(['___'])), ['value', 'valuenum']] = np.nan
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51492) & (lab_events_sampled_sub['value'].isin(['NEG.  ', 'NEG', ' '])), ['value', 'valuenum']] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51492) & (lab_events_sampled_sub['value'].isin(['TR', 'TR.  ', 'TR*.  '])), ['value', 'valuenum']] = 10
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51492) & (lab_events_sampled_sub['value'].str.contains('TR.')), ['value', 'valuenum']] = 10
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51492) & (lab_events_sampled_sub['value'].isin(['>300'])), ['value', 'valuenum']] = 350
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51492) & (lab_events_sampled_sub['value'].isin(['>600.  ', '>600*.  '])), ['value', 'valuenum']] = 700

lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51492) & (lab_events_sampled_sub['ref_range_lower'].isnull()), 'ref_range_lower'] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51492) & (lab_events_sampled_sub['ref_range_upper'].isnull()), 'ref_range_upper'] = 0

# 51486
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51486) & (lab_events_sampled_sub['value'].isin([np.nan])), ['value']] = lab_events_sampled_sub['comments']
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51486) & (lab_events_sampled_sub['value'].isin(['NEG.  ', 'NEG', ' ', 'N'])), ['value', 'valuenum']] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51486) & (lab_events_sampled_sub['value'].isin(['TR', 'TR.  ', 'TR*.  '])), ['value', 'valuenum']] = 1
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51486) & (lab_events_sampled_sub['value'].isin(['SM', 'SM .  ', 'SM*.  '])), ['value', 'valuenum']] = 2
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51486) & (lab_events_sampled_sub['value'].isin(['MOD', 'MOD.  ', 'MOD*.  '])), ['value', 'valuenum']] = 3
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51486) & (lab_events_sampled_sub['value'].isin(['LG', 'LG.  ', 'LG*.  ', 'LGE'])), ['value', 'valuenum']] = 4

lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51486) & (lab_events_sampled_sub['ref_range_lower'].isnull()), 'ref_range_lower'] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51486) & (lab_events_sampled_sub['ref_range_upper'].isnull()), 'ref_range_upper'] = 0

# 51506
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51506) & (lab_events_sampled_sub['value'].isin([np.nan])), ['value']] = lab_events_sampled_sub['comments']
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51506) & (lab_events_sampled_sub['value'].isin(['Clear.  ', 'CLEAR.  ', 'Clear', ' ', 'CLEAR'])), ['value', 'valuenum']] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51506) & (lab_events_sampled_sub['value'].isin(['SlHazy'])), ['value', 'valuenum']] = 0.5
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51506) & (lab_events_sampled_sub['value'].isin(['Hazy.  ', 'Hazy*.  ', 'Hazy', 'HAZY', 'HAZY*.  ', 'Slcldy', 'SLCLOUDY'])), ['value', 'valuenum']] = 1
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51506) & (lab_events_sampled_sub['value'].isin(['Cloudy*.  ', 'Cloudy.  ', 'Cloudy', 'CLO', 'CLOUDY', 'CLOU', 'CLOUDY*.  '])), ['value', 'valuenum']] = 2

lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51506) & (lab_events_sampled_sub['ref_range_lower'].isnull()), 'ref_range_lower'] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51506) & (lab_events_sampled_sub['ref_range_upper'].isnull()), 'ref_range_upper'] = 0


CPU times: user 5.29 s, sys: 248 ms, total: 5.54 s
Wall time: 6.36 s


Additional Data Cleaning

In [123]:
def _replace_where(labs, itemid, column, raw_values, replacement):
    """Set `value` and `valuenum` to `replacement` on rows of `itemid` whose
    `column` holds one of `raw_values`. Modifies `labs` in place."""
    hit = (labs['itemid'] == itemid) & labs[column].isin(raw_values)
    labs.loc[hit, ['value', 'valuenum']] = replacement


def _zero_fill_missing_ref_range(labs, itemid):
    """Set missing `ref_range_lower` / `ref_range_upper` of `itemid` to 0, in place."""
    is_item = labs['itemid'] == itemid
    for bound in ('ref_range_lower', 'ref_range_upper'):
        labs.loc[is_item & labs[bound].isnull(), bound] = 0


def _flag_positive_as_abnormal(labs, itemid):
    """Mark rows of `itemid` whose `value` is greater than 0 as 'abnormal', in place.

    Only the rows of `itemid` are cast to float, so an uncleaned text value in an
    unrelated test cannot make this step fail. A non-numeric value left in
    `itemid` itself still raises, which is the signal that cleaning is incomplete.
    """
    is_item = (labs['itemid'] == itemid).to_numpy()
    positive = np.zeros(len(labs), dtype=bool)
    positive[is_item] = (labs.loc[is_item, 'value'].astype(float) > 0).to_numpy()
    labs.loc[positive, 'flag'] = 'abnormal'


# (itemid, column that is tested, raw entries, replacement).
# Keep this order: an earlier replacement turns `value` into a number, which stops
# later rules from matching the same row.
value_replacement_rules = [
    (51484, 'value', [np.nan, ' ', 'TR', 'NEG'], 0),
    (51478, 'value', [np.nan, ' ', 'TR', 'NEG'], 0),
    # What are good replacement values for these?
    (51484, 'value', ['>80'], 120),
    (51478, 'value', ['>1000'], 1250),
    (51498, 'value', ['<1.005'], 1.000),
    (51498, 'value', ['<=1.005'], 1.000),
    (51498, 'value', ['>1.030', '.1.030'], 1.035),
    (51498, 'value', ['>1.035'], 1.040),
    (51498, 'value', ['>1.050'], 1.055),
    (51498, 'comments', ['>1.050*.'], 1.055),  # this one is matched on the comment text
    (51498, 'value', ['>=1.035'], 1.040),
    (51498, 'value', [' '], 1.015),
    (51237, 'value', ['>13.4'], 15),
    (51491, 'value', [' '], 6.0),
    (51498, 'value', ['>1.070'], 1.080),
    (51275, 'value', ['ERROR'], np.nan),
]

for rule in value_replacement_rules:
    _replace_where(lab_events_sampled_sub, *rule)

# The reference-range fill only reads/writes the ref_range_* columns, so it is
# independent of the value replacements above.
for ref_itemid in (51484, 51478):
    _zero_fill_missing_ref_range(lab_events_sampled_sub, ref_itemid)

# Any result above 0 counts as abnormal for these tests.
for flagged_itemid in (51484, 51478, 51466):
    _flag_positive_as_abnormal(lab_events_sampled_sub, flagged_itemid)


In [124]:
# Check on differences between value and valuenum columns.
# A row is listed only when the text of `value` matches `valuenum` under none of
# the accepted spellings (trailing ".0" / "0", leading "0", or rounding to 10 decimals).
value_as_text = lab_events_sampled_sub['value'].astype(str)
valuenum_as_text = lab_events_sampled_sub['valuenum'].astype(str)
valuenum_rounded_text = lab_events_sampled_sub['valuenum'].round(10).astype(str)

values_agree = (
    (value_as_text + '.0' == valuenum_as_text)
    | (value_as_text + '0' == valuenum_as_text)
    | (value_as_text == valuenum_as_text)
    | (value_as_text == valuenum_as_text + '0')
    | ('0' + value_as_text == valuenum_as_text)
    | (value_as_text == valuenum_rounded_text)
)

lab_events_sampled_sub.loc[~values_agree]


Unnamed: 0,labevent_id,subject_id,hadm_id,specimen_id,itemid,charttime,storetime,value,valuenum,valueuom,ref_range_lower,ref_range_upper,flag,priority,comments


## Code to look into specific test values for cleaning

In [125]:
# lab_events_sampled_sub[lab_events_sampled_sub['value'] == 'N']

In [126]:
# # Used to investigate bad data
# pd.set_option('display.max_colwidth', -1)
# pd.set_option('display.max_rows', 200)

# bad_id = 50920

# # lab_events_sampled_sub[lab_events_sampled_sub['itemid'] == bad_id].value.value_counts().reset_index().sort_values(by = 'index')
# # lab_events_sampled_sub[lab_events_sampled_sub['itemid'] == bad_id][['value', 'comments']].drop_duplicates()
# lab_events_sampled_sub[lab_events_sampled_sub['itemid'] == bad_id][['flag', 'comments', 'value', 'valuenum', 'ref_range_lower', 'ref_range_upper']].drop_duplicates()
# # lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == bad_id) & (lab_events_sampled_sub['value'].isnull()), ][['value', 'valuenum', 'comments']].drop_duplicates()


In [127]:
# pd.set_option('display.max_rows', 20)

## Get Aggregate Test Values

In [128]:
# Collapse the cleaned lab events to one row per (subject_id, hadm_id, itemid):
#   itemid count          -> number of results recorded for the test
#   valuenum min/max/mean -> summary of the numeric results
#   flag count            -> number of results with a non-null flag
#   ref_range_* min       -> smallest lower / upper reference bound seen
# NOTE(review): the upper bound is aggregated with 'min' too - confirm this is intended.
%time lab_events_sampled_sub_grouped = lab_events_sampled_sub.groupby(['subject_id', 'hadm_id', 'itemid']).agg({'itemid' : ['count'], 'valuenum' : ['min', 'max', 'mean'], 'flag' : ['count'], 'ref_range_lower' : ['min'], 'ref_range_upper' : ['min']})
# Not aggregated for now: 'comments' : ['unique'],


CPU times: user 446 ms, sys: 403 ms, total: 849 ms
Wall time: 1.63 s


In [129]:
# Turn the group keys back into columns, then replace the two-level
# (column, aggregation) header with flat names.
# At this point `abn_percent` still holds the raw count of flagged results; it is
# divided by `count` in a later cell.
lab_events_sampled_sub_grouped = lab_events_sampled_sub_grouped.reset_index()
lab_events_sampled_sub_grouped.columns = [
    'subject_id', 'hadm_id', 'itemid',
    'count',
    'min', 'max', 'mean',
    'abn_percent',
    'range_min', 'range_max',
]
# 'comments', 


In [130]:
# Number of missing entries per column; only columns with at least one are listed.
missing_vals = lab_events_sampled_sub_grouped.isna().sum()
missing_vals.loc[missing_vals.gt(0)].sort_values()

min     1825
max     1825
mean    1825
dtype: int64

In [131]:
# Which tests account for the groups that have no numeric result at all (min is missing)?
lab_events_sampled_sub_grouped.loc[
    lab_events_sampled_sub_grouped['min'].isna(), 'itemid'
].value_counts()

# lab_events_sampled_sub_grouped.loc[(lab_events_sampled_sub_grouped['min'].isnull()) & (lab_events_sampled_sub_grouped['itemid'] == 51003), ]#.subject_id.nunique()
# # lab_events_sampled_sub_grouped.subject_id.nunique()

# lab_events_sampled_sub_grouped.loc[lab_events_sampled_sub_grouped['subject_id'] == 10002264, ]


51003    1494
50911     318
51274      10
51006       1
51277       1
51265       1
Name: itemid, dtype: int64

In [132]:
# Derived features per (subject, admission, test):
#   below_min   - how far the lowest result falls under the reference lower bound (0 if it does not)
#   above_max   - how far the highest result exceeds the reference upper bound (0 if it does not)
#   abn_percent - share of results that carry a flag (flag count / result count)
# NOTE(review): a missing min/max compares as False, so those groups get 0 rather than NaN.
# NOTE(review): re-running this cell divides abn_percent by count a second time.
lab_events_sampled_sub_grouped = lab_events_sampled_sub_grouped.assign(
    below_min=lambda g: np.where(g['min'] < g['range_min'], g['range_min'] - g['min'], 0),
    above_max=lambda g: np.where(g['max'] > g['range_max'], g['max'] - g['range_max'], 0),
    abn_percent=lambda g: g['abn_percent'] / g['count'],
)


## Merge Admissions data so we can group by ethnicity/gender to grab average range min and max

In [133]:
# Attach the admission attributes (ethnicity, gender, ...) to every grouped lab row.
lab_adm = pd.merge(lab_events_sampled_sub_grouped, admissions_sample, on='hadm_id')

# Mean reference bounds per (test, ethnicity, gender).
lab_range_min_max_eg = (
    lab_adm
    .groupby(['itemid', 'ethnicity', 'gender'])
    .agg({'range_min': 'mean', 'range_max': 'mean'})
    .reset_index()
)

# Lookup: (itemid, ethnicity, gender) -> (mean range_min, mean range_max)
lab_range_eg_dic = {
    (itemid, ethnicity, gender): (range_min, range_max)
    for itemid, ethnicity, gender, range_min, range_max in zip(
        lab_range_min_max_eg['itemid'],
        lab_range_min_max_eg['ethnicity'],
        lab_range_min_max_eg['gender'],
        lab_range_min_max_eg['range_min'],
        lab_range_min_max_eg['range_max'],
    )
}

# # These labs have missing data for the entire ethnicity/gender combo
# NOTE(review): these overrides are unconditional and tied to the current sample;
# re-check them whenever the sample changes.
lab_range_eg_dic.update({
    (50911, 'AMERICAN INDIAN/ALASKA NATIVE', 'M'): (0.0, 10.0),
    (51003, 'AMERICAN INDIAN/ALASKA NATIVE', 'M'): (0.0, 0.01),
})

lab_range_eg_dic


{(50868, 'AMERICAN INDIAN/ALASKA NATIVE', 'F'): (8.6, 19.4),
 (50868, 'AMERICAN INDIAN/ALASKA NATIVE', 'M'): (8.5, 19.5),
 (50868, 'ASIAN', 'F'): (8.352941176470589, 19.58823529411765),
 (50868, 'ASIAN', 'M'): (8.439024390243903, 19.528455284552845),
 (50868, 'BLACK/AFRICAN AMERICAN', 'F'): (8.36036036036036, 19.55855855855856),
 (50868, 'BLACK/AFRICAN AMERICAN', 'M'): (8.349869451697128,
  19.514360313315926),
 (50868, 'HISPANIC/LATINO', 'F'): (8.340909090909092, 19.613636363636363),
 (50868, 'HISPANIC/LATINO', 'M'): (8.331550802139038, 19.647058823529413),
 (50868, 'OTHER', 'F'): (8.414141414141413, 19.484848484848484),
 (50868, 'OTHER', 'M'): (8.288288288288289, 19.63963963963964),
 (50868, 'UNKNOWN', 'F'): (8.40552995391705, 19.52073732718894),
 (50868, 'UNKNOWN', 'M'): (8.519572953736654, 19.423487544483987),
 (50868, 'WHITE', 'F'): (8.352985074626865, 19.573880597014924),
 (50868, 'WHITE', 'M'): (8.332845647403072, 19.596927578639356),
 (50882, 'AMERICAN INDIAN/ALASKA NATIVE', 'F

In [134]:
# # Aggregate without splitting by Ethnicity/Gender

# lab_range_min_max = lab_events_sampled_sub_grouped.groupby('itemid').agg({'range_min' : 'mean', 'range_max' : 'mean'})
# lab_range_min_max.reset_index(inplace = True)

# lab_range_dic = dict(zip(lab_range_min_max['itemid'], zip(lab_range_min_max['range_min'], lab_range_min_max['range_max'])))
# lab_range_dic


Pivot the table so we have feature columns related to test results

In [135]:
# Combine subject_id and hadm_id into a single tuple key so the pivot below can use it as its index.
lab_events_sampled_sub_grouped['new_index'] = list(zip(lab_events_sampled_sub_grouped['subject_id'], lab_events_sampled_sub_grouped['hadm_id']))


In [136]:
# One row per (subject_id, hadm_id) key and one column per (statistic, itemid) pair.
# Key/test combinations that have no grouped row come out as NaN.
%time lab_events_sampled_pivot = lab_events_sampled_sub_grouped.pivot(index = 'new_index', columns = 'itemid', values = ['min', 'max', 'mean', 'abn_percent', 'below_min', 'above_max'])
# Not pivoted for now: 'comments',


CPU times: user 116 ms, sys: 34.1 ms, total: 150 ms
Wall time: 174 ms


In [137]:
# Move the tuple key out of the index, split it back into its two parts,
# and drop the helper column.
lab_events_sampled_pivot = lab_events_sampled_pivot.reset_index()

pivot_subject_ids, pivot_hadm_ids = zip(*lab_events_sampled_pivot['new_index'])
lab_events_sampled_pivot['subject_id'] = pivot_subject_ids
lab_events_sampled_pivot['hadm_id'] = pivot_hadm_ids

lab_events_sampled_pivot = lab_events_sampled_pivot.drop(columns=['new_index'])


  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [138]:
# Missing entries per (statistic, itemid) column after the pivot; columns without gaps are hidden.
missing_vals = lab_events_sampled_pivot.isna().sum()
missing_vals.loc[missing_vals.gt(0)].sort_values()#.head(40)

             itemid
max          51221      299
above_max    51221      299
abn_percent  51221      299
below_min    51221      299
mean         51221      299
                       ... 
min          50911     6873
mean         50911     6873
             51003     7685
max          51003     7685
min          51003     7685
Length: 144, dtype: int64

## Start Data Imputation

Copy table, retain original for Random Forest

In [139]:
# Work on a copy for imputation so lab_events_sampled_pivot stays un-imputed.
lab_events_impute = lab_events_sampled_pivot.copy()

In [140]:
# Look up the label, fluid and category of every selected itemid.
d_labitems[d_labitems['itemid'].isin(itemid_sub_sample)]

Unnamed: 0,itemid,label,fluid,category,loinc_code
115,50868,Anion Gap,Blood,Chemistry,1863-0
212,50882,Bicarbonate,Blood,Chemistry,1963-8
222,51464,Bilirubin,Urine,Hematology,5770-3
282,50893,"Calcium, Total",Blood,Chemistry,2000-8
442,50902,Chloride,Blood,Chemistry,2075-0
511,50911,"Creatine Kinase, MB Isoenzyme",Blood,Chemistry,6773-6
512,50912,Creatinine,Blood,Chemistry,2160-0
723,50931,Glucose,Blood,Chemistry,6777-7
761,51221,Hematocrit,Blood,Hematology,4544-3
771,51222,Hemoglobin,Blood,Hematology,718-7


In [141]:
# Visual check of the frame to be imputed (all columns are shown because display.max_columns is None).
lab_events_impute

Unnamed: 0_level_0,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,subject_id,hadm_id
itemid,50868,50882,50893,50902,50911,50912,50931,50960,50970,50971,50983,51003,51006,51221,51222,51248,51249,51250,51265,51274,51277,51279,51301,51464,50868,50882,50893,50902,50911,50912,50931,50960,50970,50971,50983,51003,51006,51221,51222,51248,51249,51250,51265,51274,51277,51279,51301,51464,50868,50882,50893,50902,50911,50912,50931,50960,50970,50971,50983,51003,51006,51221,51222,51248,51249,51250,51265,51274,51277,51279,51301,51464,50868,50882,50893,50902,50911,50912,50931,50960,50970,50971,50983,51003,51006,51221,51222,51248,51249,51250,51265,51274,51277,51279,51301,51464,50868,50882,50893,50902,50911,50912,50931,50960,50970,50971,50983,51003,51006,51221,51222,51248,51249,51250,51265,51274,51277,51279,51301,51464,50868,50882,50893,50902,50911,50912,50931,50960,50970,50971,50983,51003,51006,51221,51222,51248,51249,51250,51265,51274,51277,51279,51301,51464,Unnamed: 145_level_1,Unnamed: 146_level_1
0,15.0,14.0,7.7,95.0,44.0,1.1,165.0,1.7,3.0,3.5,129.0,0.69,30.0,33.1,11.1,30.4,32.1,92.0,112.0,11.8,12.3,3.59,11.0,0.0,23.0,22.0,8.7,103.0,170.0,1.6,370.0,2.4,4.1,4.8,137.0,6.74,45.0,43.0,14.3,31.8,34.5,97.0,216.0,18.9,13.2,4.58,36.8,0.0,18.900000,18.700000,8.270000,99.545455,112.166667,1.420000,252.900000,2.172727,3.530000,4.000000,133.090909,2.813333,40.000000,36.927273,12.330000,31.110000,33.400000,93.400000,168.181818,13.445455,12.760000,3.960000,21.620000,0.0,0.200000,0.900000,0.600000,0.090909,1.0,0.800000,1.000000,0.000000,0.000000,0.000000,0.363636,1.0,1.000000,0.818182,0.8,0.000,0.000000,0.0,0.272727,0.545455,0.0,1.0,1.000000,0.0,0.0,8.0,0.7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,6.9,2.6,0.0,0.0,0.0,38.0,0.0,0.0,1.01,0.0,0.0,3.0,0.0,0.0,0.0,160.0,0.4,270.0,0.0,0.0,0.0,0.0,6.73,25.0,0.0,0.0,0.0,0.0,0.0,0.0,6.4,0.0,0.0,26.8,0.0,10002495,24982426.0
1,9.0,23.0,,103.0,,1.0,96.0,1.9,,4.2,139.0,,11.0,26.4,9.9,29.6,35.9,80.0,151.0,14.6,13.9,3.28,7.0,,13.0,31.0,,113.0,,1.6,138.0,2.3,,4.6,143.0,,18.0,34.8,12.2,30.7,38.4,83.0,184.0,16.6,14.7,3.98,12.4,,10.666667,27.250000,,107.500000,,1.233333,117.666667,2.100000,,4.416667,140.666667,,15.833333,30.800000,11.200000,30.140000,37.300000,81.200000,172.200000,15.600000,14.300000,3.706000,9.560000,,0.000000,0.000000,,0.500000,,0.333333,0.666667,0.000000,,0.000000,0.000000,,0.000000,1.000000,1.0,0.000,1.000000,0.6,0.000000,1.000000,0.0,1.0,0.200000,,0.0,0.0,,0.0,,0.0,0.0,0.0,,0.0,0.0,,0.0,13.6,4.1,0.0,0.0,2.0,0.0,0.0,0.0,1.32,0.0,,0.0,0.0,,5.0,,0.4,33.0,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,3.4,0.0,0.0,3.2,0.0,0.0,1.4,,10002527,29112696.0
2,12.0,25.0,8.5,91.0,3.0,1.1,116.0,2.0,3.3,3.7,130.0,,20.0,36.8,12.1,31.9,31.3,99.0,271.0,12.7,14.4,3.70,6.4,0.0,17.0,36.0,9.0,95.0,3.0,1.3,166.0,2.7,5.0,5.5,138.0,,37.0,41.2,13.4,32.8,32.8,102.0,460.0,12.7,15.1,4.15,19.3,0.0,14.888889,30.444444,8.766667,93.000000,3.000000,1.212500,144.500000,2.366667,4.066667,4.660000,133.100000,,29.250000,38.887500,12.600000,32.487500,32.312500,100.500000,354.125000,12.700000,14.800000,3.871250,10.850000,0.0,0.000000,0.222222,0.000000,1.000000,0.0,0.750000,1.000000,0.333333,0.333333,0.100000,0.400000,0.0,0.875000,0.000000,0.0,0.875,0.000000,1.0,0.125000,0.000000,0.0,1.0,0.375000,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.50,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.2,61.0,0.1,0.5,0.4,0.0,0.00,17.0,0.0,0.0,0.8,0.0,4.0,20.0,0.0,0.0,0.0,8.3,0.0,10005012,28371912.0
3,13.0,24.0,8.7,108.0,,0.6,93.0,2.0,3.1,4.2,141.0,,9.0,31.7,10.4,32.1,32.7,99.0,343.0,,12.0,3.22,6.0,,13.0,24.0,8.7,108.0,,0.6,93.0,2.0,3.1,4.2,141.0,,9.0,33.5,10.4,32.1,32.7,99.0,343.0,,12.0,3.22,6.0,,13.000000,24.000000,8.700000,108.000000,,0.600000,93.000000,2.000000,3.100000,4.200000,141.000000,,9.000000,32.600000,10.400000,32.100000,32.700000,99.000000,343.000000,,12.000000,3.220000,6.000000,,0.000000,0.000000,0.000000,0.000000,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,0.000000,1.000000,1.0,1.000,0.000000,1.0,0.000000,,0.0,1.0,0.000000,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,8.3,3.6,0.0,0.0,0.0,0.0,,0.0,1.38,0.0,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.1,0.0,1.0,0.0,,0.0,0.0,0.0,,10010264,26641707.0
4,11.0,29.0,8.8,101.0,,0.8,101.0,1.9,4.5,4.3,137.0,,9.0,41.7,13.7,30.6,32.8,93.0,320.0,11.6,14.3,4.47,14.2,,11.0,29.0,8.8,101.0,,0.8,101.0,1.9,4.5,4.3,137.0,,9.0,41.7,13.7,30.6,32.8,93.0,320.0,11.6,14.3,4.47,14.2,,11.000000,29.000000,8.800000,101.000000,,0.800000,101.000000,1.900000,4.500000,4.300000,137.000000,,9.000000,41.700000,13.700000,30.600000,32.800000,93.000000,320.000000,11.600000,14.300000,4.470000,14.200000,,0.000000,0.000000,0.000000,0.000000,,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,,0.000000,0.000000,1.0,0.000,0.000000,0.0,0.000000,0.000000,0.0,1.0,1.000000,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.13,0.0,,0.0,0.0,0.0,0.0,,0.0,1.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.2,,10010920,24676144.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8727,12.0,26.0,,105.0,3.0,1.1,105.0,,,3.9,139.0,,16.0,36.8,11.9,28.7,32.4,89.0,227.0,,12.0,4.15,4.6,,12.0,26.0,,105.0,3.0,1.1,105.0,,,3.9,139.0,,16.0,36.8,11.9,28.7,32.4,89.0,227.0,,12.0,4.15,4.6,,12.000000,26.000000,,105.000000,3.000000,1.100000,105.000000,,,3.900000,139.000000,,16.000000,36.800000,11.900000,28.700000,32.400000,89.000000,227.000000,,12.000000,4.150000,4.600000,,0.000000,0.000000,,0.000000,0.0,0.000000,0.000000,,,0.000000,0.000000,0.0,0.000000,0.000000,1.0,0.000,0.000000,0.0,0.000000,,0.0,1.0,0.000000,,0.0,0.0,,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,,0.0,0.05,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,,,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,19993069,25459716.0
8728,12.0,26.0,9.1,104.0,,1.5,89.0,2.1,3.8,5.1,142.0,,21.0,36.8,11.9,32.2,32.3,100.0,201.0,,12.7,3.69,8.9,,12.0,26.0,9.1,104.0,,1.5,89.0,2.1,3.8,5.1,142.0,,21.0,37.6,11.9,32.2,32.3,100.0,201.0,,12.7,3.69,8.9,,12.000000,26.000000,9.100000,104.000000,,1.500000,89.000000,2.100000,3.800000,5.100000,142.000000,,21.000000,37.200000,11.900000,32.200000,32.300000,100.000000,201.000000,,12.700000,3.690000,8.900000,,0.000000,0.000000,0.000000,0.000000,,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,1.000000,1.000000,1.0,1.000,0.000000,1.0,0.000000,,0.0,1.0,0.000000,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,3.2,1.8,0.0,0.0,0.0,0.0,,0.0,0.91,0.0,,0.0,0.0,0.0,0.0,,0.3,0.0,0.0,0.0,0.0,0.0,,1.0,0.0,0.0,0.2,0.0,2.0,0.0,,0.0,0.0,0.0,,19995478,24108472.0
8729,10.0,16.0,7.5,86.0,,0.8,148.0,1.4,3.1,4.1,114.0,,13.0,23.1,7.7,26.7,33.3,77.0,226.0,13.2,11.7,2.85,7.9,0.0,18.0,23.0,8.7,97.0,,1.2,312.0,2.6,4.0,5.4,133.0,,20.0,30.6,10.5,27.3,35.0,81.0,334.0,14.2,12.6,3.87,11.7,0.0,14.055556,19.888889,8.293333,91.055556,,0.936842,215.666667,1.906667,3.433333,4.688889,125.222222,,16.888889,26.222222,8.977778,26.966667,34.211111,78.777778,280.555556,13.740000,12.233333,3.328889,9.500000,0.0,0.000000,0.611111,0.533333,0.888889,,0.000000,1.000000,0.066667,0.000000,0.000000,1.000000,,0.000000,1.000000,1.0,0.000,0.000000,1.0,0.000000,1.000000,0.0,1.0,0.333333,0.0,0.0,6.0,0.9,10.0,,0.0,0.0,0.2,0.0,0.0,21.0,,0.0,16.9,6.0,0.0,0.0,5.0,0.0,0.0,0.0,1.75,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,212.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.7,0.0,0.0,1.7,0.0,19996783,25894657.0
8730,5.0,22.0,7.8,92.0,,0.5,76.0,1.8,1.3,2.8,131.0,,14.0,26.7,8.4,29.3,30.9,90.0,81.0,14.4,16.8,2.80,5.4,0.0,16.0,33.0,9.5,120.0,,0.8,225.0,2.8,5.1,5.1,155.0,,42.0,41.4,12.9,31.0,33.9,97.0,185.0,27.5,21.0,4.41,26.7,0.0,10.214286,27.642857,8.602857,99.190476,,0.611905,129.547619,2.139024,2.808571,4.364286,137.095238,,29.642857,30.673529,9.944118,30.038235,32.432353,92.735294,128.617647,17.682759,18.632353,3.312647,10.855882,0.0,0.404762,0.023810,0.285714,0.476190,,0.000000,0.904762,0.048780,0.485714,0.047619,0.500000,,0.809524,0.970588,1.0,0.000,0.235294,0.0,0.764706,1.000000,1.0,1.0,0.323529,0.0,5.0,0.0,0.6,4.0,,0.0,0.0,0.0,1.4,0.7,4.0,,0.0,13.3,5.3,0.0,1.1,0.0,69.0,0.0,0.0,1.80,0.0,0.0,0.0,1.0,0.0,12.0,,0.0,125.0,0.2,0.6,0.0,8.0,,22.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,5.5,0.0,16.7,0.0,19997886,20793010.0


In [142]:
# Join the admission attributes from admissions_sample onto the lab frame,
# matching rows on hadm_id (inner join, the pandas default).
lab_events_impute = pd.merge(
    lab_events_impute,
    admissions_sample,
    how='inner',
    left_on='hadm_id',
    right_on='hadm_id',
)



In [143]:
%%time

np.random.seed(0)

for labitem in lab_range_eg_dic:
    for ind in lab_events_impute.loc[(lab_events_impute[('mean', labitem[0])].isnull()) & (lab_events_impute['ethnicity'] == labitem[1]) & (lab_events_impute['gender'] == labitem[2])].index:
        val_max = lab_range_eg_dic[labitem][1]
        val_min = lab_range_eg_dic[labitem][0]

        val_ave = (val_max + val_min) / 2
        val_std = (val_max - val_ave) * .333

        ran_vals = np.random.normal(val_ave, val_std, 50)
        impute_min = min(ran_vals)
        impute_max = max(ran_vals)
        impute_mean = np.mean(ran_vals)
        
        val_min = lab_events_impute[('min', labitem[0])][ind]
        val_max = lab_events_impute[('max', labitem[0])][ind]
        val_mean = lab_events_impute[('mean', labitem[0])][ind]
        
        lab_events_impute[('min', labitem[0])][ind] = np.where(np.isnan(val_min), impute_min, val_min)
        lab_events_impute[('max', labitem[0])][ind] = np.where(np.isnan(val_max), impute_max, val_max)
        lab_events_impute[('mean', labitem[0])][ind] = np.where(np.isnan(val_mean), impute_mean, val_mean)
#         patient[('mean', labitem)].replace(np.nan, impute_mean)
#         patient[('abn_count', labitem)].replace(np.nan, 0)
# 5k patients
# 10 items - 2.5s
# 20 items - 6s
# 30 items - 28.6s

# 10k patients
# 40 items - 1m 34s

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CPU times: user 20.5 s, sys: 358 ms, total: 20.8 s
Wall time: 22.5 s


In [144]:
# Replace the NaNs left in the abnormal-value summary columns with 0.
for lab_itemid in lab_range_eg_dic:
    for stat in ('above_max', 'below_min', 'abn_percent'):
        stat_col = (stat, lab_itemid[0])
        lab_events_impute[stat_col] = lab_events_impute[stat_col].replace(np.nan, 0)
#     lab_events_sampled_pivot[( 'comments', lab_itemid)] = lab_events_sampled_pivot[( 'comments', lab_itemid)].replace(np.nan, np.array(np.nan))


In [145]:
# Count the NaNs per column and list only the columns that still have some.
missing_vals = lab_events_impute.isna().sum()
missing_vals.loc[missing_vals.gt(0)].sort_values()

Series([], dtype: int64)

In [146]:
# Start the KNN branch from an independent copy of the pivoted lab table.
lab_events_impute_KNN = lab_events_sampled_pivot.copy(deep=True)

In [147]:
# Attach admission attributes, then turn the categorical ones into numeric columns.
lab_events_impute_KNN = lab_events_impute_KNN.merge(
    admissions_sample,
    left_on='hadm_id',
    right_on='hadm_id',
)

# One-hot encode each categorical column, dropping one level as the baseline.
for categorical, baseline in (('ethnicity', 'ethnicity_WHITE'), ('insurance', 'insurance_Other')):
    dummy = pd.get_dummies(lab_events_impute_KNN[categorical], prefix=categorical)
    dummy = dummy.drop(columns=baseline)
    lab_events_impute_KNN = pd.concat(
        [lab_events_impute_KNN.drop(columns=categorical), dummy],
        axis=1,
    )

# Gender becomes a 0/1 float column (M -> 0.0, F -> 1.0).
lab_events_impute_KNN['gender'] = pd.to_numeric(
    lab_events_impute_KNN['gender'].map({'M' : 0.0, 'F' : 1.0})
)


In [148]:
%%time

from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=math.trunc((lab_events_impute_KNN.shape[0] ** 0.5)))
lab_events_impute_KNN_imputed = imputer.fit_transform(lab_events_impute_KNN)

# 3min 8s

CPU times: user 1min 13s, sys: 13.5 s, total: 1min 26s
Wall time: 1min 32s


In [149]:
# Wrap the imputed values in a DataFrame labelled with the pre-imputation column labels.
lab_events_impute_KNN_imputed = pd.DataFrame(
    data=lab_events_impute_KNN_imputed,
    columns=lab_events_impute_KNN.columns,
)

In [150]:
# Replace any NaNs in the abnormal-value summary columns of the KNN frame with 0.
for lab_itemid in lab_range_eg_dic:
    for stat in ('above_max', 'below_min', 'abn_percent'):
        stat_col = (stat, lab_itemid[0])
        lab_events_impute_KNN_imputed[stat_col] = lab_events_impute_KNN_imputed[stat_col].replace(np.nan, 0)
#     lab_events_sampled_pivot[( 'comments', lab_itemid)] = lab_events_sampled_pivot[( 'comments', lab_itemid)].replace(np.nan, np.array(np.nan))


## Standardize the Data for use in Models

In [151]:
from sklearn.preprocessing import StandardScaler

# Zero-mean / unit-variance scaler; the arguments spell out the library defaults.
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [152]:
# Work on copies so the imputed / pivoted frames keep their unscaled values.
labs_scaled, labs_scaled_KNN, labs_scaled_missing_data = (
    frame.copy()
    for frame in (lab_events_impute, lab_events_impute_KNN_imputed, lab_events_sampled_pivot)
)

In [153]:
# Left-join the admission attributes so every row of the lab table is kept.
labs_scaled_missing_data = pd.merge(
    labs_scaled_missing_data,
    admissions_sample,
    how='left',
    left_on='hadm_id',
    right_on='hadm_id',
)



In [154]:
# Identifier and label columns are excluded from scaling in every frame.
id_and_label_cols = [('subject_id', ''), ('hadm_id', ''), 'subject_id', 'hadm_id', 'pos']

# Each frame paired with the additional categorical columns it must leave untouched.
scaling_plan = (
    (labs_scaled, ['ethnicity', 'gender', 'insurance']),
    (labs_scaled_KNN, ['gender', 'ethnicity_AMERICAN INDIAN/ALASKA NATIVE', 'ethnicity_ASIAN',
                       'ethnicity_BLACK/AFRICAN AMERICAN', 'ethnicity_HISPANIC/LATINO',
                       'ethnicity_OTHER', 'ethnicity_UNKNOWN',
                       'insurance_Medicare', 'insurance_Medicaid']),
    (labs_scaled_missing_data, ['ethnicity', 'gender', 'insurance']),
)

# The shared scaler is refitted for each frame in turn, then applied to that frame in place.
for frame, categorical_cols in scaling_plan:
    scale_cols = ~frame.columns.isin(id_and_label_cols + categorical_cols)

    scaler.fit(frame.loc[:, scale_cols])
    frame.loc[:, scale_cols] = scaler.transform(frame.loc[:, scale_cols])


## Save Lab Results for Models

In [155]:
# Three export candidates: range-imputed, KNN-imputed, and the non-imputed
# frame (kept with its missing values for the forest models).
pos_final_test = labs_scaled.copy(deep=True)
pos_KNN_final_test = labs_scaled_KNN.copy(deep=True)
pos_final_missing_data_test = labs_scaled_missing_data.copy(deep=True)


In [156]:
def _encode_demographics(frame):
    """Return a copy of ``frame`` with its demographic columns made numeric.

    ``ethnicity`` and ``insurance`` are one-hot encoded with one baseline level
    dropped (``ethnicity_WHITE`` and ``insurance_Other``), matching the encoding
    applied to the KNN frame; the dummy columns are appended at the end.
    ``gender`` is mapped to 0.0 (M) / 1.0 (F).

    Raises:
        KeyError: if ``ethnicity``, ``insurance`` or ``gender`` is missing, or a
            baseline dummy column is not produced.
    """
    encoded = frame
    for column, baseline in (('ethnicity', 'ethnicity_WHITE'), ('insurance', 'insurance_Other')):
        dummy = pd.get_dummies(encoded[column], prefix=column).drop(baseline, axis=1)
        # concat builds a new frame, so the caller's frame is never modified.
        encoded = pd.concat([encoded.drop(column, axis=1), dummy], axis=1)

    encoded['gender'] = pd.to_numeric(encoded['gender'].map({'M' : 0.0, 'F' : 1.0}))
    return encoded


# Previously only ethnicity was encoded here, leaving 'insurance' as a raw string
# column in these two frames while the KNN frame carried insurance_* dummies.
pos_final_test = _encode_demographics(pos_final_test)
pos_final_missing_data_test = _encode_demographics(pos_final_missing_data_test)

In [157]:
# Build an itemid -> "label - fluid - category" lookup for the sampled lab items.
labitem_names = (
    d_labitems.loc[d_labitems['itemid'].isin(itemid_sub_sample)]
    .assign(**{
        'Short Name': lambda items: items['label'] + ' - ' + items['fluid'] + ' - ' + items['category']
    })
    .loc[:, ['itemid', 'Short Name']]
    .set_index('itemid')
)
labitem_dict = labitem_names['Short Name'].to_dict()


In [158]:
def rename_lab_col(table_name, name_map=None):
    """Rename ``(statistic, itemid)`` lab columns to ``(statistic, short name)`` in place.

    Only columns whose label is a 2-tuple with a second element present in the
    lookup are renamed. Every other column is left untouched: plain string
    columns, the ``('subject_id', '')`` / ``('hadm_id', '')`` tuples, and lab
    columns whose item id has no entry in the lookup.

    Unpacking all remaining labels as 2-tuples would instead split a plain
    string label such as ``'insurance'`` into characters and rename the column
    to ``('i', None)``, and would turn unknown item ids into ``(stat, None)``.

    Args:
        table_name: DataFrame whose columns are renamed in place.
        name_map: optional mapping of itemid -> display name. Defaults to the
            notebook-level ``labitem_dict``.

    Returns:
        None. ``table_name`` is modified in place.
    """
    if name_map is None:
        name_map = labitem_dict

    rename_dict = {
        col: (col[0], name_map[col[1]])
        for col in table_name.columns
        if isinstance(col, tuple) and len(col) == 2 and col[1] in name_map
    }

    table_name.rename(columns = rename_dict, inplace = True)


In [159]:
# Give the lab columns of every export frame readable names (renamed in place).
for export_frame in (pos_final_test, pos_KNN_final_test, pos_final_missing_data_test):
    rename_lab_col(export_frame)

In [163]:
# Write each model-ready frame to a gzip-compressed CSV under data/.
exports = {
    'data/pos_final_test.csv.gz': pos_final_test,
    'data/pos_KNN_final_test.csv.gz': pos_KNN_final_test,
    'data/pos_final_missing_data_test.csv.gz': pos_final_missing_data_test,
}
for export_path, export_frame in exports.items():
    export_frame.to_csv(export_path, compression="gzip")

In [162]:
# Preview the KNN-imputed, standardized frame after its lab columns were renamed.
pos_KNN_final_test

Unnamed: 0,hadm_id,"(min, Anion Gap - Blood - Chemistry)","(min, Bicarbonate - Blood - Chemistry)","(min, Calcium, Total - Blood - Chemistry)","(min, Chloride - Blood - Chemistry)","(min, Creatine Kinase, MB Isoenzyme - Blood - Chemistry)","(min, Creatinine - Blood - Chemistry)","(min, Glucose - Blood - Chemistry)","(min, Magnesium - Blood - Chemistry)","(min, Phosphate - Blood - Chemistry)","(min, Potassium - Blood - Chemistry)","(min, Sodium - Blood - Chemistry)","(min, Troponin T - Blood - Chemistry)","(min, Urea Nitrogen - Blood - Chemistry)","(min, Hematocrit - Blood - Hematology)","(min, Hemoglobin - Blood - Hematology)","(min, MCH - Blood - Hematology)","(min, MCHC - Blood - Hematology)","(min, MCV - Blood - Hematology)","(min, Platelet Count - Blood - Hematology)","(min, PT - Blood - Hematology)","(min, RDW - Blood - Hematology)","(min, Red Blood Cells - Blood - Hematology)","(min, White Blood Cells - Blood - Hematology)","(min, Bilirubin - Urine - Hematology)","(max, Anion Gap - Blood - Chemistry)","(max, Bicarbonate - Blood - Chemistry)","(max, Calcium, Total - Blood - Chemistry)","(max, Chloride - Blood - Chemistry)","(max, Creatine Kinase, MB Isoenzyme - Blood - Chemistry)","(max, Creatinine - Blood - Chemistry)","(max, Glucose - Blood - Chemistry)","(max, Magnesium - Blood - Chemistry)","(max, Phosphate - Blood - Chemistry)","(max, Potassium - Blood - Chemistry)","(max, Sodium - Blood - Chemistry)","(max, Troponin T - Blood - Chemistry)","(max, Urea Nitrogen - Blood - Chemistry)","(max, Hematocrit - Blood - Hematology)","(max, Hemoglobin - Blood - Hematology)","(max, MCH - Blood - Hematology)","(max, MCHC - Blood - Hematology)","(max, MCV - Blood - Hematology)","(max, Platelet Count - Blood - Hematology)","(max, PT - Blood - Hematology)","(max, RDW - Blood - Hematology)","(max, Red Blood Cells - Blood - Hematology)","(max, White Blood Cells - Blood - Hematology)","(max, Bilirubin - Urine - Hematology)","(mean, Anion Gap - Blood - Chemistry)","(mean, 
Bicarbonate - Blood - Chemistry)","(mean, Calcium, Total - Blood - Chemistry)","(mean, Chloride - Blood - Chemistry)","(mean, Creatine Kinase, MB Isoenzyme - Blood - Chemistry)","(mean, Creatinine - Blood - Chemistry)","(mean, Glucose - Blood - Chemistry)","(mean, Magnesium - Blood - Chemistry)","(mean, Phosphate - Blood - Chemistry)","(mean, Potassium - Blood - Chemistry)","(mean, Sodium - Blood - Chemistry)","(mean, Troponin T - Blood - Chemistry)","(mean, Urea Nitrogen - Blood - Chemistry)","(mean, Hematocrit - Blood - Hematology)","(mean, Hemoglobin - Blood - Hematology)","(mean, MCH - Blood - Hematology)","(mean, MCHC - Blood - Hematology)","(mean, MCV - Blood - Hematology)","(mean, Platelet Count - Blood - Hematology)","(mean, PT - Blood - Hematology)","(mean, RDW - Blood - Hematology)","(mean, Red Blood Cells - Blood - Hematology)","(mean, White Blood Cells - Blood - Hematology)","(mean, Bilirubin - Urine - Hematology)","(abn_percent, Anion Gap - Blood - Chemistry)","(abn_percent, Bicarbonate - Blood - Chemistry)","(abn_percent, Calcium, Total - Blood - Chemistry)","(abn_percent, Chloride - Blood - Chemistry)","(abn_percent, Creatine Kinase, MB Isoenzyme - Blood - Chemistry)","(abn_percent, Creatinine - Blood - Chemistry)","(abn_percent, Glucose - Blood - Chemistry)","(abn_percent, Magnesium - Blood - Chemistry)","(abn_percent, Phosphate - Blood - Chemistry)","(abn_percent, Potassium - Blood - Chemistry)","(abn_percent, Sodium - Blood - Chemistry)","(abn_percent, Troponin T - Blood - Chemistry)","(abn_percent, Urea Nitrogen - Blood - Chemistry)","(abn_percent, Hematocrit - Blood - Hematology)","(abn_percent, Hemoglobin - Blood - Hematology)","(abn_percent, MCH - Blood - Hematology)","(abn_percent, MCHC - Blood - Hematology)","(abn_percent, MCV - Blood - Hematology)","(abn_percent, Platelet Count - Blood - Hematology)","(abn_percent, PT - Blood - Hematology)","(abn_percent, RDW - Blood - Hematology)","(abn_percent, Red Blood Cells - Blood - 
Hematology)","(abn_percent, White Blood Cells - Blood - Hematology)","(abn_percent, Bilirubin - Urine - Hematology)","(below_min, Anion Gap - Blood - Chemistry)","(below_min, Bicarbonate - Blood - Chemistry)","(below_min, Calcium, Total - Blood - Chemistry)","(below_min, Chloride - Blood - Chemistry)","(below_min, Creatine Kinase, MB Isoenzyme - Blood - Chemistry)","(below_min, Creatinine - Blood - Chemistry)","(below_min, Glucose - Blood - Chemistry)","(below_min, Magnesium - Blood - Chemistry)","(below_min, Phosphate - Blood - Chemistry)","(below_min, Potassium - Blood - Chemistry)","(below_min, Sodium - Blood - Chemistry)","(below_min, Troponin T - Blood - Chemistry)","(below_min, Urea Nitrogen - Blood - Chemistry)","(below_min, Hematocrit - Blood - Hematology)","(below_min, Hemoglobin - Blood - Hematology)","(below_min, MCH - Blood - Hematology)","(below_min, MCHC - Blood - Hematology)","(below_min, MCV - Blood - Hematology)","(below_min, Platelet Count - Blood - Hematology)","(below_min, PT - Blood - Hematology)","(below_min, RDW - Blood - Hematology)","(below_min, Red Blood Cells - Blood - Hematology)","(below_min, White Blood Cells - Blood - Hematology)","(below_min, Bilirubin - Urine - Hematology)","(above_max, Anion Gap - Blood - Chemistry)","(above_max, Bicarbonate - Blood - Chemistry)","(above_max, Calcium, Total - Blood - Chemistry)","(above_max, Chloride - Blood - Chemistry)","(above_max, Creatine Kinase, MB Isoenzyme - Blood - Chemistry)","(above_max, Creatinine - Blood - Chemistry)","(above_max, Glucose - Blood - Chemistry)","(above_max, Magnesium - Blood - Chemistry)","(above_max, Phosphate - Blood - Chemistry)","(above_max, Potassium - Blood - Chemistry)","(above_max, Sodium - Blood - Chemistry)","(above_max, Troponin T - Blood - Chemistry)","(above_max, Urea Nitrogen - Blood - Chemistry)","(above_max, Hematocrit - Blood - Hematology)","(above_max, Hemoglobin - Blood - Hematology)","(above_max, MCH - Blood - Hematology)","(above_max, MCHC - Blood - 
Hematology)","(above_max, MCV - Blood - Hematology)","(above_max, Platelet Count - Blood - Hematology)","(above_max, PT - Blood - Hematology)","(above_max, RDW - Blood - Hematology)","(above_max, Red Blood Cells - Blood - Hematology)","(above_max, White Blood Cells - Blood - Hematology)","(above_max, Bilirubin - Urine - Hematology)","(subject_id, )","(hadm_id, )",subject_id,pos,ed_length,gender,age,ethnicity_AMERICAN INDIAN/ALASKA NATIVE,ethnicity_ASIAN,ethnicity_BLACK/AFRICAN AMERICAN,ethnicity_HISPANIC/LATINO,ethnicity_OTHER,ethnicity_UNKNOWN,insurance_Medicaid,insurance_Medicare
0,24982426.0,1.182607,-2.509053,-0.957046,-1.148015,4.198359,0.239791,2.012795,-0.524119,0.119039,-0.613346,-1.828411,0.564400,1.446539,0.089090,0.107540,0.325997,-0.414806,0.470613,-1.066719,-0.455109,-0.977161,-0.063221,0.555704,-0.318961,2.096239,-1.554098,-0.574925,-0.546781,6.689081,0.268663,2.050968,0.098436,0.096660,0.516467,-1.046825,7.457160,1.124175,1.207629,1.138789,0.567534,0.475210,0.855524,-0.436155,0.240431,-0.634347,0.786336,2.358820,-0.427930,2.178426,-2.260178,-0.841833,-0.893599,7.101364,0.371979,3.388536,0.441568,0.162661,-0.230909,-1.840012,3.734095,1.602447,0.450004,0.436861,0.465090,0.087944,0.518691,-0.703359,-0.258525,-0.799455,0.168692,1.717525,-0.390929,0.974679,2.987161,0.953078,-0.238783,5.508363,1.791925,1.069869,-0.398571,-0.904213,-0.395129,1.508328,2.479962,1.816402,0.352709,0.274491,-0.637857,-0.657202,-0.466430,0.188966,0.148180,-0.503757,0.673312,1.788044,-0.161742,-0.266054,3.473338,0.808809,0.177877,0.0,-0.146966,-0.212479,-0.340748,-0.676723,-0.318632,1.531005,0.0,-0.263418,0.203608,0.229775,-0.236176,-0.378151,-0.233166,1.036957,-0.056411,0.0,0.357964,-0.233635,0.0,1.548027,-0.221230,-0.107311,-0.396345,6.720077,0.053716,2.060974,-0.053373,-0.337849,-0.236471,-0.209453,7.940673,1.023915,-0.06231,-0.037071,-0.373470,-0.348752,-0.275056,-0.216701,0.235945,-0.356701,-0.064079,2.404039,-0.427930,10002495.0,24982426.0,10002495.0,0.0,-0.808885,0.0,1.163393,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,29112696.0,-1.040646,-0.122525,-0.010158,0.498673,-0.245684,0.100355,-0.142880,0.140691,-0.066415,0.944656,0.548389,-0.483222,-0.392496,-0.952693,-0.429944,-0.001982,2.021947,-1.460730,-0.623152,0.422634,-0.035082,-0.475169,-0.130799,0.227965,-0.724522,0.984043,0.253902,1.575105,-0.255757,0.268663,-0.161654,0.034133,0.264695,0.208784,0.557140,-0.206202,-0.296981,-0.382912,-0.015748,0.130333,3.150191,-1.233227,-0.693000,-0.030542,0.063084,-0.189489,0.045995,0.117731,-1.186681,0.556404,0.126911,1.147068,-0.296598,0.170723,-0.086331,0.221689,0.109513,0.867753,0.581027,-0.330655,-0.232032,-0.680665,-0.154784,0.066483,2.867963,-1.419662,-0.660464,0.235059,0.027044,-0.224482,0.011025,0.175521,-0.349731,-0.558246,-0.052905,1.333809,-0.001066,0.421027,0.159216,-0.398571,0.101045,-0.395129,-0.416040,0.123047,-0.805807,0.790831,0.763399,-0.637857,2.178212,1.372004,-0.606384,1.428695,-0.503757,0.673312,-0.327845,0.091298,-0.266054,-0.462140,0.041320,-0.304879,0.0,-0.146966,-0.212479,-0.340748,0.022644,-0.318632,-0.299231,0.0,-0.263418,1.394304,0.971395,-0.236176,-0.378151,0.739380,-0.438314,-0.056411,0.0,0.814963,-0.233635,0.0,-0.237184,-0.221230,0.309672,1.634722,-0.170866,0.053716,-0.222857,-0.053373,0.422127,-0.236471,-0.209453,-0.256092,-0.447786,-0.06231,-0.037071,-0.373470,7.528135,-0.275056,-0.216701,-0.147266,-0.356701,-0.064079,-0.162401,0.117731,10002527.0,29112696.0,10002527.0,0.0,-0.808885,0.0,-0.424744,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,28371912.0,0.070980,0.407814,0.168329,-1.971359,-0.471009,0.239791,0.481953,0.473096,0.502312,-0.168203,-1.590731,-0.070022,0.478626,0.664403,0.555443,0.940957,-0.927807,1.597230,0.741667,-0.172977,0.259318,0.082954,-0.233774,-0.318961,0.403783,2.394121,-0.048176,-2.244291,-0.608638,0.029748,0.105387,0.291344,0.933835,1.593357,-0.779498,0.296345,0.703092,0.858486,0.643988,0.964990,-0.690808,1.601507,1.522294,-0.490017,0.249066,0.086995,0.700032,-0.427930,0.539015,1.608733,0.111809,-2.572776,-0.608054,0.148262,0.603161,1.027913,0.926558,1.509372,-1.837107,0.138875,0.786420,0.811726,0.578228,1.031154,-0.687253,1.646749,1.281619,-0.429301,0.295388,0.031314,0.193561,-0.390929,-0.349731,0.317163,-0.849657,3.255866,-0.837049,1.645043,1.069869,1.765617,0.238110,0.296591,1.700765,-1.445884,1.488626,-1.618837,-1.681143,1.592154,-0.657202,2.597626,-0.241849,-1.388438,-0.503757,0.673312,0.135006,-0.161742,-0.266054,-0.462140,-0.631061,2.108900,0.0,-0.146966,-0.212479,-0.340748,-0.676723,-0.318632,1.073446,0.0,-0.263418,-1.022631,-1.055701,-0.236176,-0.378151,-0.233166,-0.438314,-0.056411,0.0,-0.393873,-0.233635,0.0,-0.237184,3.236063,-0.107311,-0.396345,-0.455377,-0.116081,0.046963,0.012447,0.349678,0.981065,-0.209453,-0.410333,0.552971,-0.06231,-0.037071,0.536767,-0.348752,1.576738,0.117741,-0.530477,-0.356701,-0.064079,0.534781,-0.427930,10005012.0,28371912.0,10005012.0,0.0,-0.808885,1.0,1.368313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,26641707.0,0.441523,0.142645,0.449673,1.527853,-0.145268,-0.457391,-0.236605,0.473096,0.246797,0.944656,1.023749,-0.237000,-0.586079,-0.128596,-0.205992,1.022951,-0.030056,1.597230,1.560558,0.103087,-1.153801,-0.554901,-0.302425,-0.045498,-0.724522,-0.990067,-0.574925,0.514162,0.178883,-0.527722,-0.590826,-0.158775,-0.833534,-0.406581,0.022485,0.211190,-0.770700,-0.635071,-1.005351,0.686771,-0.759397,1.153917,0.583202,-0.049671,-1.192293,-1.425535,-0.560647,0.000803,-0.233007,-0.514227,-0.016196,1.275338,0.075107,-0.512108,-0.720149,-0.080645,-0.449406,0.296449,0.687553,-0.005327,-0.750747,-0.348510,-0.573648,0.871916,-0.411034,1.408427,1.162858,0.034295,-1.207338,-0.976775,-0.492718,-0.020088,-0.349731,-0.558246,-0.849657,-0.588248,0.012526,-0.558187,-1.662090,-0.398571,-0.904213,-0.395129,-0.416040,-0.312153,-0.805807,0.790831,0.763399,1.910727,-0.657202,2.597626,-0.606384,0.036775,-0.503757,0.673312,-0.856817,-0.094265,-0.266054,-0.462140,-0.631061,-0.304879,0.0,-0.146966,-0.212479,-0.340748,-0.676723,-0.318632,-0.299231,0.0,-0.263418,0.452410,0.724189,-0.236176,-0.378151,-0.233166,-0.438314,-0.056411,0.0,0.903414,-0.233635,0.0,-0.237184,-0.221230,-0.107311,-0.396345,0.267473,-0.285877,-0.540859,-0.053373,-0.337849,-0.236471,-0.209453,0.429985,-0.447786,-0.06231,-0.037071,-0.259691,-0.348752,0.187893,-0.216701,-0.049403,-0.356701,-0.064079,-0.303859,0.000803,10010264.0,26641707.0,10010264.0,0.0,-0.002302,0.0,-1.295657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,24676144.0,-0.299562,1.468493,0.590345,0.087001,0.017603,-0.178518,0.013328,0.140691,2.035403,1.167227,0.073029,0.042711,-0.586079,1.426304,1.272088,0.407991,0.034070,0.631559,1.298968,-0.517804,0.200438,1.106180,1.104906,-0.100190,-1.288674,0.420012,-0.399342,-0.971159,-0.112443,-0.368445,-0.514529,-0.223078,0.468738,-0.252740,-1.046825,0.157124,-0.770700,0.955470,0.808922,0.090587,-0.690808,0.258738,0.398594,-0.619613,-0.122897,0.607435,0.216614,-0.038172,-1.050442,1.132897,0.175812,-0.520448,-0.092336,-0.296477,-0.514587,-0.382979,1.543370,0.560128,-0.590756,0.076160,-0.750747,1.330719,1.154166,0.255513,-0.339751,0.455138,0.917328,-0.681300,0.027044,0.958136,0.667589,-0.073065,-0.349731,-0.558246,-0.849657,-0.588248,0.146603,-0.558187,1.069869,-0.398571,-0.904213,-0.395129,-0.416040,0.019273,-0.805807,-1.618837,0.763399,-0.637857,-0.657202,-0.466430,-0.606384,-1.388438,-0.503757,0.673312,1.788044,0.378076,-0.266054,-0.462140,-0.631061,-0.304879,0.0,-0.146966,-0.212479,-0.340748,-0.676723,-0.318632,-0.299231,0.0,-0.263418,-1.022631,-0.907377,-0.236176,-0.378151,-0.233166,-0.438314,-0.056411,0.0,-0.939323,-0.233635,0.0,-0.237184,-0.221230,-0.107311,-0.396345,-0.064778,-0.285877,-0.531222,-0.053373,-0.337849,-0.236471,-0.209453,-0.094647,-0.447786,-0.06231,-0.037071,-0.373470,-0.348752,-0.275056,-0.216701,-0.530477,-0.356701,-0.064079,0.019472,-0.038172,10010920.0,24676144.0,10010920.0,0.0,0.460369,0.0,-1.090736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8727,25459716.0,0.070980,0.672984,-0.119065,0.910345,-0.471009,0.239791,0.138295,-0.030873,-0.074658,0.276940,0.548389,0.456454,0.091460,0.664403,0.465863,-0.370958,-0.222431,-0.012222,0.241233,-0.026350,-1.153801,0.680943,-0.542701,-0.264268,-1.006598,-0.426035,-0.150128,-0.122404,-0.608638,-0.129529,-0.476380,-0.023947,-0.131388,-0.868106,-0.512170,0.327703,-0.402252,0.005024,-0.180681,-0.664578,-0.965165,-0.338048,-0.347864,-0.191301,-1.192293,0.086995,-0.693350,0.078755,-0.641725,0.144623,-0.158889,0.505715,-0.608054,0.026969,-0.411805,-0.021863,-0.119751,-0.494588,0.048398,0.354311,-0.219380,0.426519,0.211722,-0.525264,-0.624881,-0.180387,-0.075464,-0.130920,-1.207338,0.462799,-0.690819,-0.062470,-0.349731,-0.558246,0.188859,-0.588248,-0.837049,-0.558187,-1.662090,0.077785,0.055701,-0.395129,-0.416040,-1.445884,-0.805807,-1.618837,0.763399,-0.637857,-0.657202,-0.466430,-0.606384,-0.018762,-0.503757,0.673312,-0.856817,0.138532,-0.266054,-0.462140,0.143062,-0.304879,0.0,-0.146966,-0.212479,-0.015753,0.105973,-0.318632,-0.299231,0.0,-0.263418,-1.022631,-1.006259,-0.236176,-0.378151,-0.233166,-0.438314,-0.056411,0.0,-1.057258,-0.233635,0.0,-0.237184,-0.221230,-0.061822,-0.396345,-0.455377,-0.285877,-0.540859,-0.017986,-0.105716,-0.236471,-0.209453,-0.410333,-0.447786,-0.06231,-0.037071,-0.373470,-0.348752,-0.275056,-0.216701,-0.196326,-0.356701,-0.064079,-0.303859,0.078755,19993069.0,25459716.0,19993069.0,0.0,1.338702,1.0,-0.117362,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8728,24108472.0,0.070980,0.672984,1.012361,0.704509,-0.206497,0.797538,-0.361572,0.805501,1.141100,2.947801,1.261429,0.158582,0.575417,0.664403,0.465863,1.063949,-0.286556,1.758175,-0.054478,0.033987,-0.741642,0.069665,0.195290,-0.318961,-1.006598,-0.426035,0.127406,-0.334593,-0.127480,0.189025,-0.628975,-0.094473,-0.182398,0.977992,0.289813,0.017768,-0.139075,0.160199,-0.180681,0.726516,-1.033754,1.303114,-0.556551,-0.053471,-0.866825,-0.661138,-0.285762,0.000803,-0.641725,0.144623,0.751838,0.249174,-0.166496,0.458231,-0.822931,0.221689,0.546982,2.669559,1.007130,0.047571,0.160167,0.500331,0.211722,0.913010,-0.696164,1.567308,-0.353019,-0.016533,-0.831656,-0.249249,-0.082365,-0.123597,-0.349731,-0.558246,-0.849657,-0.588248,-0.017149,2.379453,-1.662090,-0.398571,-0.904213,-0.395129,-0.416040,-0.067617,1.816402,0.790831,0.763399,1.910727,-0.657202,2.597626,-0.606384,0.021417,-0.503757,0.673312,-0.856817,0.138532,-0.266054,-0.462140,-0.631061,-0.304879,0.0,-0.146966,-0.212479,-0.340748,-0.676723,-0.318632,-0.299231,0.0,-0.263418,-0.453940,-0.165756,-0.236176,-0.378151,-0.233166,-0.438314,-0.056411,0.0,0.210545,-0.233635,0.0,-0.237184,-0.221230,-0.107311,-0.396345,-0.066707,-0.031182,-0.540859,-0.053373,-0.337849,-0.236471,-0.209453,0.139649,-0.388918,-0.06231,-0.037071,-0.145911,-0.348752,0.650841,-0.216701,-0.055584,-0.356701,-0.064079,-0.303859,0.000803,19995478.0,24108472.0,19995478.0,0.0,1.153139,0.0,1.060932,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8729,25894657.0,-0.670104,-1.978713,-1.238390,-3.000539,-0.052199,-0.178518,1.481687,-1.521335,0.246797,0.722084,-5.393611,0.009056,-0.198914,-1.465811,-1.415331,-1.190905,0.354695,-1.943565,0.229859,-0.016237,-1.330441,-1.046581,0.023664,-0.318961,0.685859,-1.272083,-0.574925,-1.819913,0.113569,-0.049891,1.497813,0.227041,0.003641,1.439516,-2.116136,0.037097,-0.191710,-1.197580,-0.950373,-1.221016,0.818156,-1.531620,0.510964,-0.313296,-0.913320,-0.368390,-0.020356,-0.427930,0.198417,-1.868529,-0.797031,-3.071605,0.097535,-0.148940,2.431813,-0.362824,0.025064,1.585546,-4.354665,-0.014671,-0.151905,-1.525405,-1.318296,-1.237551,0.666125,-1.804509,0.496252,-0.191048,-1.082111,-0.808223,0.002535,-0.390929,-0.349731,1.849129,0.752774,2.828742,0.329498,-0.558187,1.069869,0.034267,-0.904213,-0.395129,4.875972,-0.194960,-0.805807,0.790831,0.763399,-0.637857,-0.657202,2.597626,-0.606384,1.428695,-0.503757,0.673312,0.024803,-0.161742,-0.266054,2.489468,1.220200,4.522680,0.0,-0.146966,-0.212479,2.177967,-0.676723,-0.318632,9.309508,0.0,-0.263418,1.980766,1.910782,-0.236176,-0.378151,2.198198,-0.438314,-0.056411,0.0,1.448864,-0.233635,0.0,-0.237184,-0.221230,-0.107311,-0.396345,0.166206,-0.285877,1.502062,-0.053373,-0.337849,-0.236471,-0.209453,0.127508,-0.447786,-0.06231,-0.037071,-0.373470,-0.348752,-0.275056,-0.216701,-0.326896,-0.356701,-0.064079,-0.132089,-0.427930,19996783.0,25894657.0,19996783.0,0.0,0.165942,0.0,1.573234,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8730,20793010.0,-2.522815,-0.387695,-0.816375,-1.765523,-0.185679,-0.596828,-0.767714,-0.191714,-2.052840,-2.171348,-1.353051,-0.435346,-0.102122,-0.906046,-1.101799,-0.124974,-1.184307,0.148723,-1.419297,0.359938,1.672437,-1.113024,-0.405400,-0.318961,0.121707,1.548074,0.829738,3.060426,-0.037732,-0.368445,0.668080,0.355647,1.026855,0.977992,3.765072,-0.366508,0.966269,0.897279,0.369098,0.249570,0.063674,0.855524,-0.684974,1.253633,2.992298,0.509853,1.401462,-0.427930,-1.371577,0.685821,-0.202719,-0.984665,-0.065117,-0.499273,0.218954,0.339673,-0.864229,0.729636,-0.560320,-0.415856,0.816242,-0.704003,-0.812339,0.024664,-0.601819,0.413081,-1.125714,0.712198,2.352165,-0.833364,0.194394,-0.390929,2.330622,-0.464452,0.008788,1.242282,-0.016797,-0.558187,0.809683,-0.081861,0.760315,-0.065738,2.229966,0.325320,1.316933,0.719958,0.763399,-0.637857,0.009954,-0.466430,1.623716,1.428695,2.355274,0.673312,-0.001127,-0.161742,8.046745,-0.462140,0.603113,1.626144,0.0,-0.146966,-0.212479,-0.340748,3.198067,5.182632,1.531005,0.0,-0.263418,1.340989,1.564692,-0.236176,1.323137,-0.233166,2.240468,-0.056411,0.0,1.522574,-0.233635,0.0,-0.237184,0.643093,-0.107311,4.478217,0.030702,-0.285877,0.663694,0.078268,0.487184,-0.236471,5.183850,-0.121599,0.847311,-0.06231,-0.037071,-0.373470,-0.348752,-0.275056,-0.216701,1.265826,3.460848,-0.064079,1.383525,-0.427930,19997886.0,20793010.0,19997886.0,0.0,1.521791,0.0,0.446170,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Check for Multicollinearity

In [161]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

In [None]:
%%time

# Boolean mask over labs_scaled's columns: True for every column that is NOT one
# of the id / pos / gender / ethnicity / age labels listed below.
num_cols = ~labs_scaled.columns.isin([('subject_id', ''), ('hadm_id', ''), 'subject_id', 'hadm_id', 'pos', 'gender', 'ethnicity', 'age'])

labs_scaled_features = labs_scaled.loc[:, num_cols]

# add_constant returns a new array instead of modifying its argument, so the
# result has to be kept (it was previously discarded, leaving the VIF
# regressions without an intercept). has_constant='add' forces the column of
# ones even if some feature is itself constant. The intercept lives only in
# vif_design (column 0); labs_scaled_features stays intercept-free for the
# cells that follow.
vif_design = add_constant(labs_scaled_features.values, has_constant='add')

# Feature i of labs_scaled_features is column i + 1 of vif_design; the intercept
# itself is not reported.
high_vif = pd.Series([variance_inflation_factor(vif_design, i + 1)
               for i in range(labs_scaled_features.shape[1])],
              index=labs_scaled_features.columns).sort_values(ascending = False)

high_vif.head(20)


In [None]:
# Build the reduced feature frame in one non-mutating step: drop() returns a new
# DataFrame, so labs_scaled_features itself is left unchanged.
# NOTE(review): these look like the highest-VIF columns from the check above —
# confirm against that cell's output before relying on the list.
labs_scaled_features_limited = labs_scaled_features.drop(
    columns=[
        ('max', 50931),
        ('mean', 51222),
        ('max', 50912),
        ('min', 51221),
        ('mean', 50912),
        ('mean', 51265),
        ('max', 51006),
        ('mean', 51221),
    ]
)

# # Top 40 labs
# labs_scaled_features_limited.drop([('above_max', 51484),
#                                   ('above_max', 51478),
#                                   ('above_max', 51466),
#                                   ('above_max', 51464),
#                                   ('max', 51003),
#                                   ('max', 50878),
#                                   ('max', 50861),
#                                   ('max', 51237),
#                                   ('mean', 51248),
#                                   ('max', 50931),
#                                   ('mean', 51222),
#                                   ('max', 51274),
#                                   ('mean', 51279),
#                                   ('mean', 51274),
#                                   ('mean', 51478),
#                                   ('mean', 50885),
#                                   ('mean', 51250),
#                                   ('max', 50911),
#                                   ('mean', 51003),
#                                   ('mean', 51466),
#                                   ('mean', 51484),
#                                   ('mean', 51486),
#                                   ('mean', 51277),
#                                   ('mean', 51221),
#                                   ('max', 50912),
#                                   ('max', 51275),
#                                   ('mean', 51492),
#                                   ('mean', 51514),
#                                   ('mean', 51006),
#                                   ('min', 51222),
#                                   ('max', 50960),
#                                   ('mean', 51249),
#                                   ('max', 50885),
#                                   ('mean', 51506),
#                                   ('max', 51248),
#                                   ('min', 51279),
#                                   ('mean', 50912),
#                                   ('mean', 51265),
#                                   ('mean', 51464),
#                                   ('min', 51248),
#                                   ('max', 51222),
#                                   ('max', 51301),
#                                   ('mean', 51487),
#                                   ('below_min', 51222),
#                                   ('max', 51006),
#                                   ('mean', 50902),
#                                   ('mean', 50911),
#                                   ('above_max', 50861),
#                                   ('mean', 50882),
#                                   ('below_min', 51279),
#                                   ('abn_percent', 51274),
#                                   ('mean', 51301),
#                                   ('mean', 50893),
#                                   ('max', 50863),
#                                   ('min', 51250),
#                                   ('mean', 50983),
#                                   ('mean', 50868),
#                                   ('mean', 50970),
#                                   ('mean', 51237),
#                                   ('mean', 50971),
#                                   ('mean', 51498),
#                                   ('mean', 51491),
#                                   ('max', 51277),
#                                   ('mean', 50878),
#                                   ('above_max', 51237),
#                                   ('mean', 51275),
#                                   ('min', 51221),
#                                   ('max', 51279),
#                                   ('below_min', 51248),
#                                   ('min', 50902),
#                                   ('mean', 50863),
#                                   ('mean', 50931),
#                                   ('max', 50902),
#                                   ('below_min', 51514),
#                                   ('max', 51466),
#                                   ('min', 50893),
#                                   ('above_max', 50912),
#                                   ('max', 51265),
#                                   ('min', 50882),
#                                   ('max', 50970),
#                                   ('mean', 50960),
#                                   ('min', 51249),
#                                   ('min', 50983),
#                                   ('min', 51237),
#                                   ('above_max', 51003),
#                                   ('above_max', 50885),
#                                   ('abn_percent', 51466),
#                                   ('max', 51486)], axis = 1, inplace = True)


In [None]:
%%time

# Recompute VIFs on the reduced feature set. Each VIF regression needs an
# intercept, so a column of ones is prepended to a separate design matrix
# (has_constant='add' forces it even if a feature is itself constant);
# labs_scaled_features_limited is left untouched.
vif_design_limited = add_constant(labs_scaled_features_limited.values, has_constant='add')

# Feature i of labs_scaled_features_limited is column i + 1 of the design
# matrix; the intercept itself is not reported.
high_vif = pd.Series([variance_inflation_factor(vif_design_limited, i + 1)
               for i in range(labs_scaled_features_limited.shape[1])],
              index=labs_scaled_features_limited.columns).sort_values(ascending = False)

high_vif.head(20)
