In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
import math

In [2]:
pd.set_option('display.max_columns',None)

In [116]:
d_labitems = pd.read_csv('data/mimic-iv-0.4/hosp/d_labitems.csv.gz', compression='gzip')

In [117]:
admissions_sample = pd.read_pickle("data/admissions_sample.pkl")

In [118]:
admissions_sample

Unnamed: 0,subject_id,hadm_id,pos,ethnicity,ed_length,insurance,gender,age
0,19257432,24061325,1,WHITE,8.233333,Medicare,M,70
1,15812823,21055965,1,WHITE,3.033333,Medicare,M,84
2,14868010,21926866,1,WHITE,5.650000,Medicare,F,86
3,14237722,25273073,1,WHITE,4.933333,Medicare,F,66
4,17116333,28885585,1,WHITE,0.000000,Medicare,F,73
...,...,...,...,...,...,...,...,...
14995,18264133,26719786,0,WHITE,0.000000,Medicaid,F,51
14996,14609714,23634251,0,WHITE,21.900000,Medicaid,M,52
14997,19906137,23380658,0,WHITE,6.216667,Other,F,24
14998,15519782,29268751,0,ASIAN,4.983333,Other,M,18


## Pull Chart and Lab data only where it matches our sample patient list

In [5]:
# # We have Decided not to use Chart data after exploring it further
# # However, it still comes up in our discussions, so keeping this for now
# # Will need to update this code based on other updates we've made

# %%time #run this for next time

# chunksize = 3*(10 ** 6)
# counter=0
# chartevents_sample = []
# for chunk in pd.read_csv('data/mimic-iv-0.4/icu/chartevents.csv.gz', compression='gzip', chunksize=chunksize):
#     chartevents_sample.append(chunk[chunk['subject_id'].isin(list(admissions_sample.subject_id))])
#     counter+=1
#     print(f'chunk {counter} processed')

# chartevents_final = pd.concat(chartevents_sample)



In [6]:
# chartevents_final.to_pickle("./data/chartevents_final.pkl")

In [7]:
# chartevents_final = pd.read_pickle("./data/chartevents_final.pkl")

In [8]:
# chartevents_final.shape

Load Lab Events table

In [9]:
# %%time

# chunksize = 3*(10 ** 6)
# counter=0
# lab_events_sample = []
# for chunk in pd.read_csv('data/mimic-iv-0.4/hosp/labevents.csv.gz', compression='gzip', chunksize=chunksize):
#     lab_events_sample.append(chunk[chunk['subject_id'].isin(list(admissions_sample.subject_id))])
#     counter+=1
#     print(f'chunk {counter} processed')

# lab_events_final = pd.concat(lab_events_sample)
# # 5k patients
# # 3min 52s

# # 10k patients
# # 4min 6s

# # 15k patients
# # 4min 27s



In [10]:
# lab_events_final.to_pickle("./data/lab_events_final.pkl")

In [13]:
lab_events_final = pd.read_pickle("data/lab_events_final.pkl")

In [14]:
lab_events_final.shape

(6765210, 15)

## Manipulating the Data

Take only records related to our sample patients

Filter lab events based on first hospital visit of our sample population

In [15]:
# Can we do this filtering when generating lab data?
# For now let's not as we're discussing using different hospitalizations from the same subjects

%time lab_events_sampled = lab_events_final[lab_events_final['hadm_id'].isin(admissions_sample.hadm_id)]


CPU times: user 1.71 s, sys: 699 ms, total: 2.41 s
Wall time: 2.42 s


Check how many patients have had each test done

In [18]:
lab_events_sampled.groupby('itemid')['subject_id'].nunique().sort_values(ascending = False).head(10)

itemid
51221    8433
51265    8297
51222    8236
51301    8235
51248    8229
51279    8229
51277    8229
51250    8229
51249    8229
50912    7985
Name: subject_id, dtype: int64

Most common tests performed on patients with pos diagnosis

In [19]:
lab_events_sampled[lab_events_sampled['subject_id'].isin(list(admissions_sample[admissions_sample['pos'] == 1].subject_id))].groupby('itemid')['subject_id'].nunique().sort_values(ascending = False).head(10)


itemid
50971    1196
50983    1193
50912    1192
50902    1191
51006    1191
51221    1181
51265    1181
50868    1180
50882    1180
50931    1174
Name: subject_id, dtype: int64

Item Black List

In [130]:
item_black_list = []
# # 50920 - test results are text in comments - need to come back and figure out how to handle this


Bonus Items To Add

In [131]:
# These test, although they have much missingness, are known to be good indicators of congestive heart failure

item_bonus_list = [51274, 51003, 50920, 50911, 51464]
# 51274 PT
# 50920 Estimated GFR (MDRD equation)
# 51003 Troponin T
# 50911 Creatine Kinase, MB Isoenzyme
# 51464 Specific Gravity


Take only the X most commonly performed tests

In [394]:
top_test_num = 20

lab_events_valid = lab_events_sampled[~lab_events_sampled['itemid'].isin(item_black_list)]

# On patients with pos diagnosis
itemid_sub_sample = lab_events_valid[lab_events_valid['subject_id'].isin(list(admissions_sample[admissions_sample['pos'] == 1].subject_id))].groupby('itemid')['subject_id'].nunique().sort_values(ascending = False).head(top_test_num).reset_index().rename(columns = {'index' : 'itemid'}).itemid.to_list()
# On all sample patients
# itemid_sub_sample = lab_events_sampled.groupby('itemid')['subject_id'].nunique().sort_values(ascending = False).head(top_test_num).reset_index().rename(columns = {'index' : 'itemid'}).itemid.to_list()

itemid_sub_sample = itemid_sub_sample + item_bonus_list

lab_events_sampled_sub = lab_events_sampled[lab_events_sampled['itemid'].isin(itemid_sub_sample)]


In [395]:
# # lab_events_valid['subject_id'].isin(list(admissions_sample[admissions_sample['pos'] == 1].subject_id))

# top_labs = lab_events_sampled.groupby(['itemid']).agg(freq=('subject_id', lambda x: len(np.unique(x))))\
# .reset_index().sort_values('freq', ascending=False).head(60)

# top_labs


In [134]:
# subset_labs['subject_id'].reset_index(drop=True).plot(kind='line')

In [135]:
# print(lab_events_valid[lab_events_valid['subject_id'].isin(list(admissions_sample[admissions_sample['pos'] == 1].subject_id))].groupby('itemid')['subject_id'].nunique().sort_values(ascending = False).head(20).reset_index().rename(columns = {'index' : 'itemid'}))

# # print(1191 / admissions_sample[admissions_sample['pos'] == 1].subject_id.nunique())
# # print(1192 / admissions_sample[admissions_sample['pos'] == 1].subject_id.nunique())


In [397]:
# print(lab_events_valid[lab_events_valid['subject_id'].isin(list(admissions_sample[admissions_sample['pos'] == 0].subject_id))].groupby('itemid')['subject_id'].nunique().sort_values(ascending = False).head(20).reset_index().rename(columns = {'index' : 'itemid'}))

# # print(6726 / admissions_sample[admissions_sample['pos'] == 0].subject_id.nunique())
# # print(6793 / admissions_sample[admissions_sample['pos'] == 0].subject_id.nunique())


In [398]:
# subset_labs['subject_id'].reset_index(drop=True).plot(kind='line')

## Data Cleaning

Make a separate table where we run some of the forest models on the Text data?

Special updates for Urine sample tests

In [399]:
# Other invalid value imputation
# For now set them all to normal/most common value
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51237) & (lab_events_sampled_sub['value'].isin([np.nan, 'ERROR', 'UNABLE TO REPORT'])), ['value', 'valuenum']] = 1.1
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51237) & (lab_events_sampled_sub['value'].isin(['>21.8'])), ['value', 'valuenum']] = 22
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51237) & (lab_events_sampled_sub['value'].isin(['>19.2'])), ['value', 'valuenum']] = 20

lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51275) &  (lab_events_sampled_sub['value'].isin(['>150', '>150.0'])), ['value', 'valuenum']] = 175

# Should we just delete these?
# For 5k patients there are 6 records here
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51274) &  (lab_events_sampled_sub['value'].isin(['ERROR'])), ['value', 'valuenum']] = 11

# Should we just delete these?
# For 5k patients there are 1 records here
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51275) &  (lab_events_sampled_sub['value'].isin(['UNABLE TO REPORT'])), ['value', 'valuenum']] = 31

# Clumsy..
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51275) &  (lab_events_sampled_sub['value'].isin(['34..3'])), ['value', 'valuenum']] = 34.3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


More Data Cleaning

In [400]:
%%time

# Can probably make a helper function combining some of the below mapping

# 51466
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51466) & (lab_events_sampled_sub['value'].isin([np.nan])), ['value']] = lab_events_sampled_sub['comments']
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51466) & (lab_events_sampled_sub['value'].isin(['NEG.  ', 'NEG', ' '])), ['value', 'valuenum']] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51466) & (lab_events_sampled_sub['value'].isin(['TR', 'TR.  ', 'TR*.  '])), ['value', 'valuenum']] = 1
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51466) & (lab_events_sampled_sub['value'].isin(['SM', 'SM .  ', 'SM*.  '])), ['value', 'valuenum']] = 2
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51466) & (lab_events_sampled_sub['value'].isin(['MOD', 'MOD.  ', 'MOD*.  '])), ['value', 'valuenum']] = 3
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51466) & (lab_events_sampled_sub['value'].isin(['LG', 'LG.  ', 'LG*.  ', 'LGE', 'LRG'])), ['value', 'valuenum']] = 4

lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51466) & (lab_events_sampled_sub['ref_range_lower'].isnull()), 'ref_range_lower'] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51466) & (lab_events_sampled_sub['ref_range_upper'].isnull()), 'ref_range_upper'] = 0

# 51514
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51514) & (lab_events_sampled_sub['value'].isin([np.nan])), ['value']] = lab_events_sampled_sub['comments']
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51514) & (lab_events_sampled_sub['value'].isin(['NEG.  ', 'NEG', 'NORMAL.  ', ' '])), ['value', 'valuenum']] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51514) & (lab_events_sampled_sub['value'].isin(['>8'])), ['value', 'valuenum']] = 10
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51514) & (lab_events_sampled_sub['value'].isin(['>12.  ', '>12*.  '])), ['value', 'valuenum']] = 15

# 51464
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51464) & (lab_events_sampled_sub['value'].isin([np.nan])), ['value']] = lab_events_sampled_sub['comments']
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51464) & (lab_events_sampled_sub['value'].isin(['NEG.  ', 'NEG', ' '])), ['value', 'valuenum']] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51464) & (lab_events_sampled_sub['value'].isin(['SM', 'SM .  ', 'SM*.  '])), ['value', 'valuenum']] = 1
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51464) & (lab_events_sampled_sub['value'].isin(['MOD', 'MOD.  ', 'MOD*.  '])), ['value', 'valuenum']] = 2
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51464) & (lab_events_sampled_sub['value'].isin(['LG', 'LG.  ', 'LG*.  ', 'LGE'])), ['value', 'valuenum']] = 3

lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51464) & (lab_events_sampled_sub['ref_range_lower'].isnull()), 'ref_range_lower'] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51464) & (lab_events_sampled_sub['ref_range_upper'].isnull()), 'ref_range_upper'] = 0

# 51487
# Should Pos be marked abnormal?
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51487) & (lab_events_sampled_sub['value'].isin([np.nan])), ['value']] = lab_events_sampled_sub['comments']
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51487) & (lab_events_sampled_sub['value'].isin(['NEG.  ', 'NEG', ' '])), ['value', 'valuenum']] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51487) & (lab_events_sampled_sub['value'].isin(['POS.  ', 'POS', 'POS*.  '])), ['value', 'valuenum']] = 1

lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51487) & (lab_events_sampled_sub['ref_range_lower'].isnull()), 'ref_range_lower'] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51487) & (lab_events_sampled_sub['ref_range_upper'].isnull()), 'ref_range_upper'] = 0

# 51492
# Should Pos be marked abnormal?
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51492) & (lab_events_sampled_sub['value'].isin([np.nan])), ['value']] = lab_events_sampled_sub['comments']
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51492) & (lab_events_sampled_sub['value'].isin(['___'])), ['value', 'valuenum']] = np.nan
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51492) & (lab_events_sampled_sub['value'].isin(['NEG.  ', 'NEG', ' '])), ['value', 'valuenum']] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51492) & (lab_events_sampled_sub['value'].isin(['TR', 'TR.  ', 'TR*.  '])), ['value', 'valuenum']] = 10
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51492) & (lab_events_sampled_sub['value'].str.contains('TR.')), ['value', 'valuenum']] = 10
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51492) & (lab_events_sampled_sub['value'].isin(['>300'])), ['value', 'valuenum']] = 350
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51492) & (lab_events_sampled_sub['value'].isin(['>600.  ', '>600*.  '])), ['value', 'valuenum']] = 700

lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51492) & (lab_events_sampled_sub['ref_range_lower'].isnull()), 'ref_range_lower'] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51492) & (lab_events_sampled_sub['ref_range_upper'].isnull()), 'ref_range_upper'] = 0

# 51486
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51486) & (lab_events_sampled_sub['value'].isin([np.nan])), ['value']] = lab_events_sampled_sub['comments']
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51486) & (lab_events_sampled_sub['value'].isin(['NEG.  ', 'NEG', ' ', 'N'])), ['value', 'valuenum']] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51486) & (lab_events_sampled_sub['value'].isin(['TR', 'TR.  ', 'TR*.  '])), ['value', 'valuenum']] = 1
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51486) & (lab_events_sampled_sub['value'].isin(['SM', 'SM .  ', 'SM*.  '])), ['value', 'valuenum']] = 2
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51486) & (lab_events_sampled_sub['value'].isin(['MOD', 'MOD.  ', 'MOD*.  '])), ['value', 'valuenum']] = 3
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51486) & (lab_events_sampled_sub['value'].isin(['LG', 'LG.  ', 'LG*.  ', 'LGE'])), ['value', 'valuenum']] = 4

lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51486) & (lab_events_sampled_sub['ref_range_lower'].isnull()), 'ref_range_lower'] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51486) & (lab_events_sampled_sub['ref_range_upper'].isnull()), 'ref_range_upper'] = 0

# 51506
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51506) & (lab_events_sampled_sub['value'].isin([np.nan])), ['value']] = lab_events_sampled_sub['comments']
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51506) & (lab_events_sampled_sub['value'].isin(['Clear.  ', 'CLEAR.  ', 'Clear', ' ', 'CLEAR'])), ['value', 'valuenum']] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51506) & (lab_events_sampled_sub['value'].isin(['SlHazy'])), ['value', 'valuenum']] = 0.5
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51506) & (lab_events_sampled_sub['value'].isin(['Hazy.  ', 'Hazy*.  ', 'Hazy', 'HAZY', 'HAZY*.  ', 'Slcldy', 'SLCLOUDY'])), ['value', 'valuenum']] = 1
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51506) & (lab_events_sampled_sub['value'].isin(['Cloudy*.  ', 'Cloudy.  ', 'Cloudy', 'CLO', 'CLOUDY', 'CLOU', 'CLOUDY*.  '])), ['value', 'valuenum']] = 2

lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51506) & (lab_events_sampled_sub['ref_range_lower'].isnull()), 'ref_range_lower'] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51506) & (lab_events_sampled_sub['ref_range_upper'].isnull()), 'ref_range_upper'] = 0


CPU times: user 3.55 s, sys: 164 ms, total: 3.72 s
Wall time: 3.72 s


Additional Data Cleaning

In [401]:
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51484) & (lab_events_sampled_sub['value'].isin([np.nan, ' ', 'TR', 'NEG'])), ['value', 'valuenum']] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51478) & (lab_events_sampled_sub['value'].isin([np.nan, ' ', 'TR', 'NEG'])), ['value', 'valuenum']] = 0

lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51484) & (lab_events_sampled_sub['ref_range_lower'].isnull()), 'ref_range_lower'] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51478) & (lab_events_sampled_sub['ref_range_lower'].isnull()), 'ref_range_lower'] = 0

lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51484) & (lab_events_sampled_sub['ref_range_upper'].isnull()), 'ref_range_upper'] = 0
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51478) & (lab_events_sampled_sub['ref_range_upper'].isnull()), 'ref_range_upper'] = 0

# What are good replacement values for these?
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51484) & (lab_events_sampled_sub['value'].isin(['>80'])), ['value', 'valuenum']] = 120
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51478) & (lab_events_sampled_sub['value'].isin(['>1000'])), ['value', 'valuenum']] = 1250

lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51498) & (lab_events_sampled_sub['value'].isin(['<1.005'])), ['value', 'valuenum']] = 1.000
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51498) & (lab_events_sampled_sub['value'].isin(['<=1.005'])), ['value', 'valuenum']] = 1.000
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51498) & (lab_events_sampled_sub['value'].isin(['>1.030', '.1.030'])), ['value', 'valuenum']] = 1.035
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51498) & (lab_events_sampled_sub['value'].isin(['>1.035'])), ['value', 'valuenum']] = 1.040
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51498) & (lab_events_sampled_sub['value'].isin(['>1.050'])), ['value', 'valuenum']] = 1.055
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51498) & (lab_events_sampled_sub['comments'].isin(['>1.050*.'])), ['value', 'valuenum']] = 1.055
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51498) & (lab_events_sampled_sub['value'].isin(['>=1.035'])), ['value', 'valuenum']] = 1.040
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51498) & (lab_events_sampled_sub['value'].isin([' '])), ['value', 'valuenum']] = 1.015

lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51237) & (lab_events_sampled_sub['value'].isin(['>13.4'])), ['value', 'valuenum']] = 15

lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51491) & (lab_events_sampled_sub['value'].isin([' '])), ['value', 'valuenum']] = 6.0

lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51498) & (lab_events_sampled_sub['value'].isin(['>1.070'])), ['value', 'valuenum']] = 1.080

lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51275) & (lab_events_sampled_sub['value'].isin(['ERROR'])), ['value', 'valuenum']] = np.nan

lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51484) & (lab_events_sampled_sub['value'].astype(float) > 0), 'flag'] = 'abnormal'
lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51478) & (lab_events_sampled_sub['value'].astype(float) > 0), 'flag'] = 'abnormal'

lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == 51466) & (lab_events_sampled_sub['value'].astype(float) > 0), 'flag'] = 'abnormal'


In [402]:
# Check on differences between value and valuenum columns

lab_events_sampled_sub.loc[((lab_events_sampled_sub['value'].astype(str) + str('.0')) != lab_events_sampled_sub['valuenum'].astype(str)) 
                           & ((lab_events_sampled_sub['value'].astype(str) + str('0')) != lab_events_sampled_sub['valuenum'].astype(str)) 
                           & ((lab_events_sampled_sub['value'].astype(str)) != lab_events_sampled_sub['valuenum'].astype(str)) 
                           & ((lab_events_sampled_sub['value'].astype(str)) != lab_events_sampled_sub['valuenum'].astype(str) + str('0')) 
                           & ((str('0') + lab_events_sampled_sub['value'].astype(str)) != lab_events_sampled_sub['valuenum'].astype(str))
                           & ((lab_events_sampled_sub['value'].astype(str)) != round(lab_events_sampled_sub['valuenum'], 10).astype(str)), ]


Unnamed: 0,labevent_id,subject_id,hadm_id,specimen_id,itemid,charttime,storetime,value,valuenum,valueuom,ref_range_lower,ref_range_upper,flag,priority,comments


## Code to look into specific test values for cleaning

In [403]:
# lab_events_sampled_sub[lab_events_sampled_sub['value'] == 'N']

In [456]:
# Used to investigate bad data
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 200)

bad_id = 50920

# lab_events_sampled_sub[lab_events_sampled_sub['itemid'] == bad_id].value.value_counts().reset_index().sort_values(by = 'index')
# lab_events_sampled_sub[lab_events_sampled_sub['itemid'] == bad_id][['value', 'comments']].drop_duplicates()
lab_events_sampled_sub[lab_events_sampled_sub['itemid'] == bad_id][['flag', 'comments', 'value', 'valuenum', 'ref_range_lower', 'ref_range_upper']].drop_duplicates().head(100)
# lab_events_sampled_sub.loc[(lab_events_sampled_sub['itemid'] == bad_id) & (lab_events_sampled_sub['value'].isnull()), ][['value', 'valuenum', 'comments']].drop_duplicates()


  


Unnamed: 0,flag,comments,value,valuenum,ref_range_lower,ref_range_upper
2724941,,___,,,60.0,200.0
11632431,,"Using this patient's age, gender, and serum creatinine value of 0.2,. Estimated GFR = >75 if non African-American (mL/min/1.73 m2). Estimated GFR = >75 if African-American (mL/min/1.73 m2). For comparison, mean GFR for age group 20-29 is 116 (mL/min/1.73 m2). GFR<60 = Chronic Kidney Disease, GFR<15 = Kidney Failure.",75.0,75.0,60.0,200.0
11632439,,"Using this patient's age, gender, and serum creatinine value of 0.2,. Estimated GFR = >75 if non African-American (mL/min/1.73 m2). Estimated GFR = >75 if African-American (mL/min/1.73 m2). For comparison, mean GFR for age group 30-39 is 107 (mL/min/1.73 m2). GFR<60 = Chronic Kidney Disease, GFR<15 = Kidney Failure.",75.0,75.0,60.0,200.0
11632527,,"Using this patient's age, gender, and serum creatinine value of 0.2, . estimated GFR (eGFR) is likely >75 mL/min/1.73 m2, . provided the serum creatinine value is stable. . (Patients with more muscle mass and better nutritional status are more . likely to be at the higher end of this range.) . An eGFR < 60 suggests kidney disease in those below the age of 65 . and there may be kidney disease in those over 65..",75.0,75.0,60.0,200.0
11632528,,"Using this patient's age, gender, and serum creatinine value of 0.3,. Estimated GFR = >75 if non African-American (mL/min/1.73 m2). Estimated GFR = >75 if African-American (mL/min/1.73 m2). For comparison, mean GFR for age group 20-29 is 116 (mL/min/1.73 m2). GFR<60 = Chronic Kidney Disease, GFR<15 = Kidney Failure.",75.0,75.0,60.0,200.0
11632565,,"Using this patient's age, gender, and serum creatinine value of 0.3,. Estimated GFR = >75 if non African-American (mL/min/1.73 m2). Estimated GFR = >75 if African-American (mL/min/1.73 m2). For comparison, mean GFR for age group 30-39 is 107 (mL/min/1.73 m2). GFR<60 = Chronic Kidney Disease, GFR<15 = Kidney Failure.",75.0,75.0,60.0,200.0
11632614,,"Using this patient's age, gender, and serum creatinine value of 0.3,. Estimated GFR = >75 if non African-American (mL/min/1.73 m2). Estimated GFR = >75 if African-American (mL/min/1.73 m2). For comparison, mean GFR for age group 40-49 is 99 (mL/min/1.73 m2). GFR<60 = Chronic Kidney Disease, GFR<15 = Kidney Failure.",75.0,75.0,60.0,200.0
11632654,,"Using this patient's age, gender, and serum creatinine value of 0.3,. Estimated GFR = >75 if non African-American (mL/min/1.73 m2). Estimated GFR = >75 if African-American (mL/min/1.73 m2). For comparison, mean GFR for age group 50-59 is 93 (mL/min/1.73 m2). GFR<60 = Chronic Kidney Disease, GFR<15 = Kidney Failure.",75.0,75.0,60.0,200.0
11632858,,"Using this patient's age, gender, and serum creatinine value of 0.3, . estimated GFR (eGFR) is likely >75 mL/min/1.73 m2, . provided the serum creatinine value is stable. . (Patients with more muscle mass and better nutritional status are more . likely to be at the higher end of this range.) . An eGFR < 60 suggests kidney disease in those below the age of 65 . and there may be kidney disease in those over 65..",75.0,75.0,60.0,200.0
11632991,,"Using this patient's age, gender, and serum creatinine value of 0.4,. Estimated GFR = >75 if non African-American (mL/min/1.73 m2). Estimated GFR = >75 if African-American (mL/min/1.73 m2). For comparison, mean GFR for age group 20-29 is 116 (mL/min/1.73 m2). GFR<60 = Chronic Kidney Disease, GFR<15 = Kidney Failure.",75.0,75.0,60.0,200.0


In [405]:
def gfr_grabber(patient):
    comment = patient['comments']
    gfr_vals = []
    
    if pd.isnull(comment):
        return np.nan
    if type(comment) == str:
        if comment == '___':
            return np.nan
        
        comment = comment.replace('>75', '75')
        if ('between' in comment):
            start_low = comment.find('between')
            end_low = comment.find(' and', start_low + 8, )

            end_high = comment.find(' mL', end_low + 5, )
            
            val = (int(comment[start_low + 8 : end_low]) + int(comment[end_low + 5 : end_high])) / 2

            gfr_vals = [val, val]
        elif 'likely' in comment:
            start_point = comment.find('likely')
            end_point = comment.find(' mL', start_point + 7, )
            gfr_vals = [comment[start_point + 7 : end_point], comment[start_point + 7 : end_point]]
        else:
            start_points = [i for i in range(len(comment)) if comment.startswith('GFR = ', i)]
            end_points = list(map(lambda x: comment.find(' ', x + 6, ), start_points))

            gfr_vals = list(map(lambda x, y: comment[x + 6 : y], start_points, end_points))

    if (admissions_sample.loc[admissions_sample['subject_id'] == patient['subject_id'], 'ethnicity'] == 'BLACK/AFRICAN AMERICAN').iloc[0]:
        return int(gfr_vals[1])
    else:
        return int(gfr_vals[0])
    
lab_gfr = lab_events_sampled_sub.loc[lab_events_sampled_sub['itemid'] == 50920, ['labevent_id', 'comments', 'subject_id']]
lab_gfr['value'] = lab_gfr.apply(gfr_grabber, axis = 1)


In [423]:
lab_events_sampled_sub.loc[lab_events_sampled_sub['labevent_id'].isin(lab_gfr.labevent_id), 'value'] = lab_events_sampled_sub.loc[lab_events_sampled_sub['labevent_id'].isin(lab_gfr.labevent_id), 'labevent_id'].map(lambda x: lab_gfr.loc[lab_gfr['labevent_id'] == x, 'value'])
lab_events_sampled_sub.loc[lab_events_sampled_sub['labevent_id'].isin(lab_gfr.labevent_id), 'value'] = lab_events_sampled_sub.loc[lab_events_sampled_sub['labevent_id'].isin(lab_gfr.labevent_id), 'value'].map(lambda x: x.iloc[0].astype(np.float64))
lab_events_sampled_sub.loc[lab_events_sampled_sub['labevent_id'].isin(lab_gfr.labevent_id), 'valuenum'] = lab_events_sampled_sub.loc[lab_events_sampled_sub['labevent_id'].isin(lab_gfr.labevent_id), 'value'].astype(np.float64)
lab_events_sampled_sub.loc[lab_events_sampled_sub['labevent_id'].isin(lab_gfr.labevent_id), 'ref_range_lower'] = 60
lab_events_sampled_sub.loc[lab_events_sampled_sub['labevent_id'].isin(lab_gfr.labevent_id), 'ref_range_upper'] = 120



## Get Aggregate Test Values

In [425]:
%time lab_events_sampled_sub_grouped = lab_events_sampled_sub.groupby(['subject_id', 'hadm_id', 'itemid']).agg({'itemid' : ['count'], 'valuenum' : ['min', 'max', 'mean'], 'flag' : ['count'], 'ref_range_lower' : ['min'], 'ref_range_upper' : ['min']})
# 'comments' : ['unique'], 


CPU times: user 322 ms, sys: 45.3 ms, total: 367 ms
Wall time: 367 ms


In [426]:
lab_events_sampled_sub_grouped.reset_index(inplace = True)
lab_events_sampled_sub_grouped.columns = ['subject_id', 'hadm_id', 'itemid', 'count', 'min', 'max', 'mean', 'abn_percent', 'range_min', 'range_max']
# 'comments', 


In [427]:
missing_vals = lab_events_sampled_sub_grouped.isnull().sum(axis = 0) 
missing_vals[missing_vals > 0].sort_values()

min     1863
max     1863
mean    1863
dtype: int64

In [428]:
lab_events_sampled_sub_grouped.loc[lab_events_sampled_sub_grouped['min'].isnull(), ].itemid.value_counts()

# lab_events_sampled_sub_grouped.loc[(lab_events_sampled_sub_grouped['min'].isnull()) & (lab_events_sampled_sub_grouped['itemid'] == 51003), ]#.subject_id.nunique()
# # lab_events_sampled_sub_grouped.subject_id.nunique()

# lab_events_sampled_sub_grouped.loc[lab_events_sampled_sub_grouped['subject_id'] == 10002264, ]


51003    1494
50911    318 
50920    38  
51274    10  
51006    1   
51277    1   
51265    1   
Name: itemid, dtype: int64

In [429]:
lab_events_sampled_sub_grouped['below_min'] = np.where(lab_events_sampled_sub_grouped['min'] < lab_events_sampled_sub_grouped['range_min'], lab_events_sampled_sub_grouped['range_min'] - lab_events_sampled_sub_grouped['min'], 0)
lab_events_sampled_sub_grouped['above_max'] = np.where(lab_events_sampled_sub_grouped['max'] > lab_events_sampled_sub_grouped['range_max'], lab_events_sampled_sub_grouped['max'] - lab_events_sampled_sub_grouped['range_max'], 0)
lab_events_sampled_sub_grouped['abn_percent'] = lab_events_sampled_sub_grouped['abn_percent'] / lab_events_sampled_sub_grouped['count']


## Merge Admissions data so we can group by ethnicity/gender to grab average range min and max

In [430]:
lab_adm = lab_events_sampled_sub_grouped.merge(admissions_sample, left_on = 'hadm_id', right_on = 'hadm_id')

lab_range_min_max_eg = lab_adm.groupby(['itemid', 'ethnicity', 'gender']).agg({'range_min' : 'mean', 'range_max' : 'mean'})
lab_range_min_max_eg.reset_index(inplace = True)

lab_range_eg_dic = dict(zip(zip(lab_range_min_max_eg['itemid'], lab_range_min_max_eg['ethnicity'], lab_range_min_max_eg['gender']), zip(lab_range_min_max_eg['range_min'], lab_range_min_max_eg['range_max'])))

# # These labs have missing data for the entire ethnicity/gender combo
lab_range_eg_dic[(50911, 'AMERICAN INDIAN/ALASKA NATIVE', 'M')] = (0.0, 10.0)
lab_range_eg_dic[(51003, 'AMERICAN INDIAN/ALASKA NATIVE', 'M')] = (0.0, 0.01)

lab_range_eg_dic


{(50868, 'AMERICAN INDIAN/ALASKA NATIVE', 'F'): (8.6, 19.4),
 (50868, 'AMERICAN INDIAN/ALASKA NATIVE', 'M'): (8.5, 19.5),
 (50868, 'ASIAN', 'F'): (8.352941176470589, 19.58823529411765),
 (50868, 'ASIAN', 'M'): (8.439024390243903, 19.528455284552845),
 (50868, 'BLACK/AFRICAN AMERICAN', 'F'): (8.36036036036036, 19.55855855855856),
 (50868, 'BLACK/AFRICAN AMERICAN', 'M'): (8.349869451697128,
  19.514360313315926),
 (50868, 'HISPANIC/LATINO', 'F'): (8.340909090909092, 19.613636363636363),
 (50868, 'HISPANIC/LATINO', 'M'): (8.331550802139038, 19.647058823529413),
 (50868, 'OTHER', 'F'): (8.414141414141413, 19.484848484848484),
 (50868, 'OTHER', 'M'): (8.288288288288289, 19.63963963963964),
 (50868, 'UNKNOWN', 'F'): (8.40552995391705, 19.52073732718894),
 (50868, 'UNKNOWN', 'M'): (8.519572953736654, 19.423487544483987),
 (50868, 'WHITE', 'F'): (8.352985074626865, 19.573880597014924),
 (50868, 'WHITE', 'M'): (8.332845647403072, 19.596927578639356),
 (50882, 'AMERICAN INDIAN/ALASKA NATIVE', 'F

In [307]:
# # Aggregate without splitting by Ethnicity/Gender

# lab_range_min_max = lab_events_sampled_sub_grouped.groupby('itemid').agg({'range_min' : 'mean', 'range_max' : 'mean'})
# lab_range_min_max.reset_index(inplace = True)

# lab_range_dic = dict(zip(lab_range_min_max['itemid'], zip(lab_range_min_max['range_min'], lab_range_min_max['range_max'])))
# lab_range_dic


Pivot the table so we have feature columns related to test results

In [431]:
lab_events_sampled_sub_grouped['new_index'] = list(zip(lab_events_sampled_sub_grouped['subject_id'], lab_events_sampled_sub_grouped['hadm_id']))


In [432]:
%time lab_events_sampled_pivot = lab_events_sampled_sub_grouped.pivot(index = 'new_index', columns = 'itemid', values = ['min', 'max', 'mean', 'abn_percent', 'below_min', 'above_max'])
# 'comments', 


CPU times: user 78.4 ms, sys: 17.8 ms, total: 96.2 ms
Wall time: 94.8 ms


In [433]:
lab_events_sampled_pivot = lab_events_sampled_pivot.reset_index()
lab_events_sampled_pivot['subject_id'], lab_events_sampled_pivot['hadm_id'] = zip(*lab_events_sampled_pivot['new_index'])
lab_events_sampled_pivot.drop(['new_index'], axis = 1, inplace = True)


  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [434]:
missing_vals = lab_events_sampled_pivot.isnull().sum(axis = 0) 
missing_vals[missing_vals > 0].sort_values()#.head(40)

             itemid
max          51221     299 
abn_percent  51221     299 
below_min    51221     299 
above_max    51221     299 
mean         51221     299 
min          51221     299 
above_max    51265     435 
abn_percent  51265     435 
below_min    51265     435 
max          51265     436 
min          51265     436 
mean         51265     436 
below_min    51222     496 
above_max    51222     496 
max          51222     496 
mean         51222     496 
abn_percent  51222     496 
min          51222     496 
mean         51301     497 
above_max    51301     497 
below_min    51301     497 
min          51301     497 
abn_percent  51301     497 
max          51301     497 
mean         51250     503 
max          51248     503 
             51249     503 
             51279     503 
abn_percent  51248     503 
above_max    51248     503 
             51249     503 
abn_percent  51250     503 
             51279     503 
mean         51279     503 
abn_percent  51277     503 


## Start Data Imputation

Copy table, retain original for Random Forest

In [435]:
lab_events_impute = lab_events_sampled_pivot.copy()

In [436]:
d_labitems[d_labitems['itemid'].isin(itemid_sub_sample)]

Unnamed: 0,itemid,label,fluid,category,loinc_code
115,50868,Anion Gap,Blood,Chemistry,1863-0
212,50882,Bicarbonate,Blood,Chemistry,1963-8
222,51464,Bilirubin,Urine,Hematology,5770-3
282,50893,"Calcium, Total",Blood,Chemistry,2000-8
442,50902,Chloride,Blood,Chemistry,2075-0
511,50911,"Creatine Kinase, MB Isoenzyme",Blood,Chemistry,6773-6
512,50912,Creatinine,Blood,Chemistry,2160-0
634,50920,Estimated GFR (MDRD equation),Blood,Chemistry,
723,50931,Glucose,Blood,Chemistry,6777-7
761,51221,Hematocrit,Blood,Hematology,4544-3


In [157]:
lab_events_impute

Unnamed: 0_level_0,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,min,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,max,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,abn_percent,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,below_min,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,above_max,subject_id,hadm_id
itemid,50868,50882,50893,50902,50911,50912,50931,50960,50970,50971,50983,51003,51006,51221,51222,51248,51249,51250,51265,51274,51277,51279,51301,51464,50868,50882,50893,50902,50911,50912,50931,50960,50970,50971,50983,51003,51006,51221,51222,51248,51249,51250,51265,51274,51277,51279,51301,51464,50868,50882,50893,50902,50911,50912,50931,50960,50970,50971,50983,51003,51006,51221,51222,51248,51249,51250,51265,51274,51277,51279,51301,51464,50868,50882,50893,50902,50911,50912,50931,50960,50970,50971,50983,51003,51006,51221,51222,51248,51249,51250,51265,51274,51277,51279,51301,51464,50868,50882,50893,50902,50911,50912,50931,50960,50970,50971,50983,51003,51006,51221,51222,51248,51249,51250,51265,51274,51277,51279,51301,51464,50868,50882,50893,50902,50911,50912,50931,50960,50970,50971,50983,51003,51006,51221,51222,51248,51249,51250,51265,51274,51277,51279,51301,51464,Unnamed: 145_level_1,Unnamed: 146_level_1
0,15.0,14.0,7.7,95.0,44.0,1.1,165.0,1.7,3.0,3.5,129.0,0.69,30.0,33.1,11.1,30.4,32.1,92.0,112.0,11.8,12.3,3.59,11.0,0.0,23.0,22.0,8.7,103.0,170.0,1.6,370.0,2.4,4.1,4.8,137.0,6.74,45.0,43.0,14.3,31.8,34.5,97.0,216.0,18.9,13.2,4.58,36.8,0.0,18.900000,18.700000,8.270000,99.545455,112.166667,1.420000,252.900000,2.172727,3.530000,4.000000,133.090909,2.813333,40.000000,36.927273,12.330000,31.110000,33.400000,93.400000,168.181818,13.445455,12.760000,3.960000,21.620000,0.0,0.200000,0.900000,0.600000,0.090909,1.0,0.800000,1.000000,0.000000,0.000000,0.000000,0.363636,1.0,1.000000,0.818182,0.8,0.000,0.000000,0.0,0.272727,0.545455,0.0,1.0,1.000000,0.0,0.0,8.0,0.7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,6.9,2.6,0.0,0.0,0.0,38.0,0.0,0.0,1.01,0.0,0.0,3.0,0.0,0.0,0.0,160.0,0.4,270.0,0.0,0.0,0.0,0.0,6.73,25.0,0.0,0.0,0.0,0.0,0.0,0.0,6.4,0.0,0.0,26.8,0.0,10002495,24982426.0
1,9.0,23.0,,103.0,,1.0,96.0,1.9,,4.2,139.0,,11.0,26.4,9.9,29.6,35.9,80.0,151.0,14.6,13.9,3.28,7.0,,13.0,31.0,,113.0,,1.6,138.0,2.3,,4.6,143.0,,18.0,34.8,12.2,30.7,38.4,83.0,184.0,16.6,14.7,3.98,12.4,,10.666667,27.250000,,107.500000,,1.233333,117.666667,2.100000,,4.416667,140.666667,,15.833333,30.800000,11.200000,30.140000,37.300000,81.200000,172.200000,15.600000,14.300000,3.706000,9.560000,,0.000000,0.000000,,0.500000,,0.333333,0.666667,0.000000,,0.000000,0.000000,,0.000000,1.000000,1.0,0.000,1.000000,0.6,0.000000,1.000000,0.0,1.0,0.200000,,0.0,0.0,,0.0,,0.0,0.0,0.0,,0.0,0.0,,0.0,13.6,4.1,0.0,0.0,2.0,0.0,0.0,0.0,1.32,0.0,,0.0,0.0,,5.0,,0.4,33.0,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,3.4,0.0,0.0,3.2,0.0,0.0,1.4,,10002527,29112696.0
2,12.0,25.0,8.5,91.0,3.0,1.1,116.0,2.0,3.3,3.7,130.0,,20.0,36.8,12.1,31.9,31.3,99.0,271.0,12.7,14.4,3.70,6.4,0.0,17.0,36.0,9.0,95.0,3.0,1.3,166.0,2.7,5.0,5.5,138.0,,37.0,41.2,13.4,32.8,32.8,102.0,460.0,12.7,15.1,4.15,19.3,0.0,14.888889,30.444444,8.766667,93.000000,3.000000,1.212500,144.500000,2.366667,4.066667,4.660000,133.100000,,29.250000,38.887500,12.600000,32.487500,32.312500,100.500000,354.125000,12.700000,14.800000,3.871250,10.850000,0.0,0.000000,0.222222,0.000000,1.000000,0.0,0.750000,1.000000,0.333333,0.333333,0.100000,0.400000,0.0,0.875000,0.000000,0.0,0.875,0.000000,1.0,0.125000,0.000000,0.0,1.0,0.375000,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.50,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.2,61.0,0.1,0.5,0.4,0.0,0.00,17.0,0.0,0.0,0.8,0.0,4.0,20.0,0.0,0.0,0.0,8.3,0.0,10005012,28371912.0
3,13.0,24.0,8.7,108.0,,0.6,93.0,2.0,3.1,4.2,141.0,,9.0,31.7,10.4,32.1,32.7,99.0,343.0,,12.0,3.22,6.0,,13.0,24.0,8.7,108.0,,0.6,93.0,2.0,3.1,4.2,141.0,,9.0,33.5,10.4,32.1,32.7,99.0,343.0,,12.0,3.22,6.0,,13.000000,24.000000,8.700000,108.000000,,0.600000,93.000000,2.000000,3.100000,4.200000,141.000000,,9.000000,32.600000,10.400000,32.100000,32.700000,99.000000,343.000000,,12.000000,3.220000,6.000000,,0.000000,0.000000,0.000000,0.000000,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,0.000000,1.000000,1.0,1.000,0.000000,1.0,0.000000,,0.0,1.0,0.000000,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,8.3,3.6,0.0,0.0,0.0,0.0,,0.0,1.38,0.0,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.1,0.0,1.0,0.0,,0.0,0.0,0.0,,10010264,26641707.0
4,11.0,29.0,8.8,101.0,,0.8,101.0,1.9,4.5,4.3,137.0,,9.0,41.7,13.7,30.6,32.8,93.0,320.0,11.6,14.3,4.47,14.2,,11.0,29.0,8.8,101.0,,0.8,101.0,1.9,4.5,4.3,137.0,,9.0,41.7,13.7,30.6,32.8,93.0,320.0,11.6,14.3,4.47,14.2,,11.000000,29.000000,8.800000,101.000000,,0.800000,101.000000,1.900000,4.500000,4.300000,137.000000,,9.000000,41.700000,13.700000,30.600000,32.800000,93.000000,320.000000,11.600000,14.300000,4.470000,14.200000,,0.000000,0.000000,0.000000,0.000000,,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,,0.000000,0.000000,1.0,0.000,0.000000,0.0,0.000000,0.000000,0.0,1.0,1.000000,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.13,0.0,,0.0,0.0,0.0,0.0,,0.0,1.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.2,,10010920,24676144.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8727,12.0,26.0,,105.0,3.0,1.1,105.0,,,3.9,139.0,,16.0,36.8,11.9,28.7,32.4,89.0,227.0,,12.0,4.15,4.6,,12.0,26.0,,105.0,3.0,1.1,105.0,,,3.9,139.0,,16.0,36.8,11.9,28.7,32.4,89.0,227.0,,12.0,4.15,4.6,,12.000000,26.000000,,105.000000,3.000000,1.100000,105.000000,,,3.900000,139.000000,,16.000000,36.800000,11.900000,28.700000,32.400000,89.000000,227.000000,,12.000000,4.150000,4.600000,,0.000000,0.000000,,0.000000,0.0,0.000000,0.000000,,,0.000000,0.000000,0.0,0.000000,0.000000,1.0,0.000,0.000000,0.0,0.000000,,0.0,1.0,0.000000,,0.0,0.0,,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,,0.0,0.05,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,,,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,19993069,25459716.0
8728,12.0,26.0,9.1,104.0,,1.5,89.0,2.1,3.8,5.1,142.0,,21.0,36.8,11.9,32.2,32.3,100.0,201.0,,12.7,3.69,8.9,,12.0,26.0,9.1,104.0,,1.5,89.0,2.1,3.8,5.1,142.0,,21.0,37.6,11.9,32.2,32.3,100.0,201.0,,12.7,3.69,8.9,,12.000000,26.000000,9.100000,104.000000,,1.500000,89.000000,2.100000,3.800000,5.100000,142.000000,,21.000000,37.200000,11.900000,32.200000,32.300000,100.000000,201.000000,,12.700000,3.690000,8.900000,,0.000000,0.000000,0.000000,0.000000,,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,1.000000,1.000000,1.0,1.000,0.000000,1.0,0.000000,,0.0,1.0,0.000000,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,3.2,1.8,0.0,0.0,0.0,0.0,,0.0,0.91,0.0,,0.0,0.0,0.0,0.0,,0.3,0.0,0.0,0.0,0.0,0.0,,1.0,0.0,0.0,0.2,0.0,2.0,0.0,,0.0,0.0,0.0,,19995478,24108472.0
8729,10.0,16.0,7.5,86.0,,0.8,148.0,1.4,3.1,4.1,114.0,,13.0,23.1,7.7,26.7,33.3,77.0,226.0,13.2,11.7,2.85,7.9,0.0,18.0,23.0,8.7,97.0,,1.2,312.0,2.6,4.0,5.4,133.0,,20.0,30.6,10.5,27.3,35.0,81.0,334.0,14.2,12.6,3.87,11.7,0.0,14.055556,19.888889,8.293333,91.055556,,0.936842,215.666667,1.906667,3.433333,4.688889,125.222222,,16.888889,26.222222,8.977778,26.966667,34.211111,78.777778,280.555556,13.740000,12.233333,3.328889,9.500000,0.0,0.000000,0.611111,0.533333,0.888889,,0.000000,1.000000,0.066667,0.000000,0.000000,1.000000,,0.000000,1.000000,1.0,0.000,0.000000,1.0,0.000000,1.000000,0.0,1.0,0.333333,0.0,0.0,6.0,0.9,10.0,,0.0,0.0,0.2,0.0,0.0,21.0,,0.0,16.9,6.0,0.0,0.0,5.0,0.0,0.0,0.0,1.75,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,212.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.7,0.0,0.0,1.7,0.0,19996783,25894657.0
8730,5.0,22.0,7.8,92.0,,0.5,76.0,1.8,1.3,2.8,131.0,,14.0,26.7,8.4,29.3,30.9,90.0,81.0,14.4,16.8,2.80,5.4,0.0,16.0,33.0,9.5,120.0,,0.8,225.0,2.8,5.1,5.1,155.0,,42.0,41.4,12.9,31.0,33.9,97.0,185.0,27.5,21.0,4.41,26.7,0.0,10.214286,27.642857,8.602857,99.190476,,0.611905,129.547619,2.139024,2.808571,4.364286,137.095238,,29.642857,30.673529,9.944118,30.038235,32.432353,92.735294,128.617647,17.682759,18.632353,3.312647,10.855882,0.0,0.404762,0.023810,0.285714,0.476190,,0.000000,0.904762,0.048780,0.485714,0.047619,0.500000,,0.809524,0.970588,1.0,0.000,0.235294,0.0,0.764706,1.000000,1.0,1.0,0.323529,0.0,5.0,0.0,0.6,4.0,,0.0,0.0,0.0,1.4,0.7,4.0,,0.0,13.3,5.3,0.0,1.1,0.0,69.0,0.0,0.0,1.80,0.0,0.0,0.0,1.0,0.0,12.0,,0.0,125.0,0.2,0.6,0.0,8.0,,22.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,5.5,0.0,16.7,0.0,19997886,20793010.0


In [437]:
lab_events_impute = lab_events_impute.merge(admissions_sample, left_on = 'hadm_id', right_on = 'hadm_id')



In [438]:
%%time

np.random.seed(0)

for labitem in lab_range_eg_dic:
    for ind in lab_events_impute.loc[(lab_events_impute[('mean', labitem[0])].isnull()) & (lab_events_impute['ethnicity'] == labitem[1]) & (lab_events_impute['gender'] == labitem[2])].index:
        val_max = lab_range_eg_dic[labitem][1]
        val_min = lab_range_eg_dic[labitem][0]

        val_ave = (val_max + val_min) / 2
        val_std = (val_max - val_ave) * .333

        ran_vals = np.random.normal(val_ave, val_std, 50)
        impute_min = min(ran_vals)
        impute_max = max(ran_vals)
        impute_mean = np.mean(ran_vals)
        
        val_min = lab_events_impute[('min', labitem[0])][ind]
        val_max = lab_events_impute[('max', labitem[0])][ind]
        val_mean = lab_events_impute[('mean', labitem[0])][ind]
        
        lab_events_impute[('min', labitem[0])][ind] = np.where(np.isnan(val_min), impute_min, val_min)
        lab_events_impute[('max', labitem[0])][ind] = np.where(np.isnan(val_max), impute_max, val_max)
        lab_events_impute[('mean', labitem[0])][ind] = np.where(np.isnan(val_mean), impute_mean, val_mean)
#         patient[('mean', labitem)].replace(np.nan, impute_mean)
#         patient[('abn_count', labitem)].replace(np.nan, 0)
# 5k patients
# 10 items - 2.5s
# 20 items - 6s
# 30 items - 28.6s

# 10k patients
# 40 items - 1m 34s

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CPU times: user 12.4 s, sys: 48 ms, total: 12.4 s
Wall time: 12.5 s


In [439]:
for lab_itemid in lab_range_eg_dic:
    lab_events_impute[( 'above_max', lab_itemid[0])] = lab_events_impute[( 'above_max', lab_itemid[0])].replace(np.nan, 0)
    lab_events_impute[( 'below_min', lab_itemid[0])] = lab_events_impute[( 'below_min', lab_itemid[0])].replace(np.nan, 0)
    lab_events_impute[( 'abn_percent', lab_itemid[0])] = lab_events_impute[( 'abn_percent', lab_itemid[0])].replace(np.nan, 0)
#     lab_events_sampled_pivot[( 'comments', lab_itemid)] = lab_events_sampled_pivot[( 'comments', lab_itemid)].replace(np.nan, np.array(np.nan))


In [440]:
missing_vals = lab_events_impute.isnull().sum(axis = 0) 
missing_vals[missing_vals > 0].sort_values()

Series([], dtype: int64)

In [441]:
lab_events_impute_KNN = lab_events_sampled_pivot.copy()

In [442]:
lab_events_impute_KNN = lab_events_impute_KNN.merge(admissions_sample, left_on = 'hadm_id', right_on = 'hadm_id')

dummy = pd.get_dummies(lab_events_impute_KNN['ethnicity'], prefix='ethnicity')
dummy.drop('ethnicity_WHITE', axis=1, inplace=True)
lab_events_impute_KNN = lab_events_impute_KNN.drop('ethnicity', axis=1)
lab_events_impute_KNN = pd.concat([lab_events_impute_KNN, dummy], axis=1)

dummy = pd.get_dummies(lab_events_impute_KNN['insurance'], prefix='insurance')
dummy.drop('insurance_Other', axis=1, inplace=True)
lab_events_impute_KNN = lab_events_impute_KNN.drop('insurance', axis=1)
lab_events_impute_KNN = pd.concat([lab_events_impute_KNN, dummy], axis=1)


lab_events_impute_KNN['gender'] = lab_events_impute_KNN['gender'].map({'M' : 0.0, 'F' : 1.0})
lab_events_impute_KNN['gender'] = pd.to_numeric(lab_events_impute_KNN['gender'])


In [443]:
%%time

from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=math.trunc((lab_events_impute_KNN.shape[0] ** 0.5)))
lab_events_impute_KNN_imputed = imputer.fit_transform(lab_events_impute_KNN)

# 3min 8s

CPU times: user 57.9 s, sys: 12.3 s, total: 1min 10s
Wall time: 1min 5s


In [444]:
lab_events_impute_KNN_imputed = pd.DataFrame(lab_events_impute_KNN_imputed)
lab_events_impute_KNN_imputed.columns = lab_events_impute_KNN.columns

In [445]:
for lab_itemid in lab_range_eg_dic:
    lab_events_impute_KNN_imputed[( 'above_max', lab_itemid[0])] = lab_events_impute_KNN_imputed[( 'above_max', lab_itemid[0])].replace(np.nan, 0)
    lab_events_impute_KNN_imputed[( 'below_min', lab_itemid[0])] = lab_events_impute_KNN_imputed[( 'below_min', lab_itemid[0])].replace(np.nan, 0)
    lab_events_impute_KNN_imputed[( 'abn_percent', lab_itemid[0])] = lab_events_impute_KNN_imputed[( 'abn_percent', lab_itemid[0])].replace(np.nan, 0)
#     lab_events_sampled_pivot[( 'comments', lab_itemid)] = lab_events_sampled_pivot[( 'comments', lab_itemid)].replace(np.nan, np.array(np.nan))


## Standardize the Data for use in Models

In [446]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [447]:
labs_scaled = lab_events_impute.copy()
labs_scaled_KNN = lab_events_impute_KNN_imputed.copy()
labs_scaled_missing_data = lab_events_sampled_pivot.copy()

In [448]:
labs_scaled_missing_data = labs_scaled_missing_data.merge(admissions_sample, how='left', left_on='hadm_id', right_on='hadm_id')



In [449]:
scale_cols = ~labs_scaled.columns.isin([('subject_id', ''), ('hadm_id', ''), 'subject_id', 'hadm_id', 'pos', 'ethnicity', 'gender', 'insurance'])

scaler.fit(labs_scaled.loc[:, scale_cols])
labs_scaled.loc[:, scale_cols] = scaler.transform(labs_scaled.loc[:, scale_cols])

scale_cols = ~labs_scaled_KNN.columns.isin([('subject_id', ''), ('hadm_id', ''), 'subject_id', 'hadm_id', 'pos', 'gender', 'ethnicity_AMERICAN INDIAN/ALASKA NATIVE', 'ethnicity_ASIAN', 'ethnicity_BLACK/AFRICAN AMERICAN', 'ethnicity_HISPANIC/LATINO', 'ethnicity_OTHER', 'ethnicity_UNKNOWN', 'insurance_Medicare','insurance_Medicaid'])

scaler.fit(labs_scaled_KNN.loc[:, scale_cols])
labs_scaled_KNN.loc[:, scale_cols] = scaler.transform(labs_scaled_KNN.loc[:, scale_cols])

scale_cols = ~labs_scaled_missing_data.columns.isin([('subject_id', ''), ('hadm_id', ''), 'subject_id', 'hadm_id', 'pos', 'ethnicity', 'gender','insurance'])

scaler.fit(labs_scaled_missing_data.loc[:, scale_cols])
labs_scaled_missing_data.loc[:, scale_cols] = scaler.transform(labs_scaled_missing_data.loc[:, scale_cols])


## Save Lab Results for Models

In [450]:
# Save non-imputed file as well for forest models

pos_final_test = labs_scaled.copy()
pos_KNN_final_test = labs_scaled_KNN.copy()
pos_final_missing_data_test = labs_scaled_missing_data.copy()


In [451]:
dummy = pd.get_dummies(pos_final_test['ethnicity'], prefix='ethnicity')
dummy.drop('ethnicity_WHITE', axis=1, inplace=True)
pos_final_test = pos_final_test.drop('ethnicity', axis=1)
pos_final_test = pd.concat([pos_final_test, dummy], axis=1)

pos_final_test['gender'] = pos_final_test['gender'].map({'M' : 0.0, 'F' : 1.0})
pos_final_test['gender'] = pd.to_numeric(pos_final_test['gender'])



dummy = pd.get_dummies(pos_final_missing_data_test['ethnicity'], prefix='ethnicity')
dummy.drop('ethnicity_WHITE', axis=1, inplace=True)
pos_final_missing_data_test = pos_final_missing_data_test.drop('ethnicity', axis=1)
pos_final_missing_data_test = pd.concat([pos_final_missing_data_test, dummy], axis=1)

pos_final_missing_data_test['gender'] = pos_final_missing_data_test['gender'].map({'M' : 0.0, 'F' : 1.0})
pos_final_missing_data_test['gender'] = pd.to_numeric(pos_final_missing_data_test['gender'])

In [452]:
labitem_names = d_labitems.loc[d_labitems['itemid'].isin(itemid_sub_sample)].copy()
labitem_names['Short Name'] = labitem_names['label'] + ' - ' + labitem_names['fluid'] + ' - ' + labitem_names['category']

labitem_names = labitem_names[['itemid', 'Short Name']]
labitem_names.set_index('itemid', inplace = True)
labitem_dict = labitem_names.to_dict()['Short Name']


In [453]:
def rename_lab_col(table_name):
    num_cols = ~table_name.columns.isin([('subject_id', ''), ('hadm_id', ''), 'subject_id', 'hadm_id', 'pos', 'gender', 'ethnicity_AMERICAN INDIAN/ALASKA NATIVE', 'ethnicity_ASIAN', 'ethnicity_BLACK/AFRICAN AMERICAN', 'ethnicity_HISPANIC/LATINO', 'ethnicity_OTHER', 'ethnicity_UNKNOWN', 'age','ed_length', 'insurance_Medicare','insurance_Medicaid'])

    col_names = []
    col_names = list(table_name.loc[:, num_cols].columns)

    x, y = zip(*col_names)
    col_names = list(zip(x, list(map(labitem_dict.get, y))))

    rename_dict = dict(zip(table_name.loc[:, num_cols].columns, col_names))

    table_name.rename(columns = rename_dict, inplace = True)


In [454]:
rename_lab_col(pos_final_test)
rename_lab_col(pos_KNN_final_test)
rename_lab_col(pos_final_missing_data_test)

In [455]:
pos_final_test.to_csv('data/pos_final_test.csv.gz', compression="gzip")
pos_KNN_final_test.to_csv('data/pos_KNN_final_test.csv.gz', compression="gzip")
pos_final_missing_data_test.to_csv('data/pos_final_missing_data_test.csv.gz', compression="gzip")

## Check for Multicollinearity

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

In [None]:
%%time

num_cols = ~labs_scaled.columns.isin([('subject_id', ''), ('hadm_id', ''), 'subject_id', 'hadm_id', 'pos', 'gender', 'ethnicity', 'age'])

labs_scaled_features = labs_scaled.loc[:, num_cols]
add_constant(labs_scaled_features)

high_vif = pd.Series([variance_inflation_factor(labs_scaled_features.values, i) 
               for i in range(labs_scaled_features.shape[1])], 
              index=labs_scaled_features.columns).sort_values(ascending = False)

high_vif.head(20)


In [None]:
labs_scaled_features_limited = labs_scaled_features.copy()

labs_scaled_features_limited.drop([('max', 50931),
                                   ('mean', 51222),
                                   ('max', 50912),
                                   ('min', 51221),
                                   ('mean', 50912),
                                   ('mean', 51265),
                                   ('max', 51006),
                                   ('mean', 51221)], axis = 1, inplace = True)

# # Top 40 labs
# labs_scaled_features_limited.drop([('above_max', 51484),
#                                   ('above_max', 51478),
#                                   ('above_max', 51466),
#                                   ('above_max', 51464),
#                                   ('max', 51003),
#                                   ('max', 50878),
#                                   ('max', 50861),
#                                   ('max', 51237),
#                                   ('mean', 51248),
#                                   ('max', 50931),
#                                   ('mean', 51222),
#                                   ('max', 51274),
#                                   ('mean', 51279),
#                                   ('mean', 51274),
#                                   ('mean', 51478),
#                                   ('mean', 50885),
#                                   ('mean', 51250),
#                                   ('max', 50911),
#                                   ('mean', 51003),
#                                   ('mean', 51466),
#                                   ('mean', 51484),
#                                   ('mean', 51486),
#                                   ('mean', 51277),
#                                   ('mean', 51221),
#                                   ('max', 50912),
#                                   ('max', 51275),
#                                   ('mean', 51492),
#                                   ('mean', 51514),
#                                   ('mean', 51006),
#                                   ('min', 51222),
#                                   ('max', 50960),
#                                   ('mean', 51249),
#                                   ('max', 50885),
#                                   ('mean', 51506),
#                                   ('max', 51248),
#                                   ('min', 51279),
#                                   ('mean', 50912),
#                                   ('mean', 51265),
#                                   ('mean', 51464),
#                                   ('min', 51248),
#                                   ('max', 51222),
#                                   ('max', 51301),
#                                   ('mean', 51487),
#                                   ('below_min', 51222),
#                                   ('max', 51006),
#                                   ('mean', 50902),
#                                   ('mean', 50911),
#                                   ('above_max', 50861),
#                                   ('mean', 50882),
#                                   ('below_min', 51279),
#                                   ('abn_percent', 51274),
#                                   ('mean', 51301),
#                                   ('mean', 50893),
#                                   ('max', 50863),
#                                   ('min', 51250),
#                                   ('mean', 50983),
#                                   ('mean', 50868),
#                                   ('mean', 50970),
#                                   ('mean', 51237),
#                                   ('mean', 50971),
#                                   ('mean', 51498),
#                                   ('mean', 51491),
#                                   ('max', 51277),
#                                   ('mean', 50878),
#                                   ('above_max', 51237),
#                                   ('mean', 51275),
#                                   ('min', 51221),
#                                   ('max', 51279),
#                                   ('below_min', 51248),
#                                   ('min', 50902),
#                                   ('mean', 50863),
#                                   ('mean', 50931),
#                                   ('max', 50902),
#                                   ('below_min', 51514),
#                                   ('max', 51466),
#                                   ('min', 50893),
#                                   ('above_max', 50912),
#                                   ('max', 51265),
#                                   ('min', 50882),
#                                   ('max', 50970),
#                                   ('mean', 50960),
#                                   ('min', 51249),
#                                   ('min', 50983),
#                                   ('min', 51237),
#                                   ('above_max', 51003),
#                                   ('above_max', 50885),
#                                   ('abn_percent', 51466),
#                                   ('max', 51486)], axis = 1, inplace = True)


In [None]:
%%time

high_vif = pd.Series([variance_inflation_factor(labs_scaled_features_limited.values, i) 
               for i in range(labs_scaled_features_limited.shape[1])], 
              index=labs_scaled_features_limited.columns).sort_values(ascending = False)

high_vif.head(20)
