In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import importlib
import ipywidgets as widgets
from pathlib import Path

# Put the project's helper folders on the import path (once each) so the
# pipeline modules below can be imported by bare name.
for module_path in ('preprocessing', 'utils'):
    if module_path in sys.path:
        continue
    sys.path.append(module_path)

# Directory that contains this notebook, resolved against the current working directory.
root_dir = os.path.split(os.path.abspath('mainPipeline.ipynb'))[0]

import day_intervals_cohort_v2
from day_intervals_cohort_v2 import *

import feature_selection_hosp
from feature_selection_hosp import *

## 1. DATA EXTRACTION
Please run below cell to select option for cohort selection.
The cohort will be saved in **./data/cohort/**

In [2]:
# Fixed choices for this notebook: the dataset version label and the task family.
version, radio_input4 = 'Version 2', 'Phenotype'

### Refining Cohort and Prediction Task Definition

Based on your current selection, the following block provides options to further refine the prediction task and the cohort associated with it:

- First you will refine the prediction task choosing from following options -
    - **Phenotype Prediction** - You can select one of five major chronic diseases and predict its future outcome

        - Heart failure
        - CAD (Coronary Artery Disease)
        - CKD (Chronic Kidney Disease)
        - COPD (Chronic obstructive pulmonary disease)
        - Diabetes

- Second, you will choose whether to perform the above task using ICU or non-ICU admissions data

- Third, you can refine the cohort selection for any of the above chosen prediction tasks by including the admission samples admitted with a particular chronic disease - 
    - Heart failure
    - CAD (Coronary Artery Disease)
    - CKD (Chronic Kidney Disease)
    - COPD (Chronic obstructive pulmonary disease)
    
print("**Please run below cell to extract the cohort for selected options**")

In [3]:
# Prediction task selector; the first entry is the default.
task_options = [
    'Heart Failure in 30 days',
    'CAD in 30 days',
    'CKD in 30 days',
    'COPD in 30 days',
    'Diabetes in 30 days',
]
radio_input2 = widgets.RadioButtons(options=task_options, value=task_options[0])
display(radio_input2)

print("Extract Data")
# The admission type is fixed to non-ICU in this notebook (no widget for it).
radio_input1 = 'Non-ICU'

print("Please select if you want to perform choosen prediction task for a specific disease.")
# Optional cohort filter; the first entry (no filter) is the default.
filter_options = ['No Disease Filter', 'Heart Failure', 'CKD', 'CAD', 'COPD']
radio_input3 = widgets.RadioButtons(options=filter_options, value=filter_options[0])
display(radio_input3)

RadioButtons(options=('Heart Failure in 30 days', 'CAD in 30 days', 'CKD in 30 days', 'COPD in 30 days', 'Diab…

Extract Data
Please select if you want to perform choosen prediction task for a specific disease.


RadioButtons(options=('No Disease Filter', 'Heart Failure', 'CKD', 'CAD', 'COPD'), value='No Disease Filter')

In [4]:
# Disease code passed to the extractor for each selectable prediction task.
task_to_disease_code = {
    'Heart Failure in 30 days': 'I50',
    'CAD in 30 days': 'I25',
    'CKD in 30 days': 'N18',
    'COPD in 30 days': 'J44',
    'Diabetes in 30 days': 'E11',
}
# Disease code used to filter the cohort for each optional disease filter.
filter_to_icd_code = {
    'Heart Failure': 'I50',
    'CKD': 'N18',
    'COPD': 'J44',
    'CAD': 'I25',
}

disease_label = ""
time = 0
# Every known task is a 30-day readmission task; `label` is only bound when
# the selected task is one of the known options.
if radio_input2.value in task_to_disease_code:
    label = 'Readmission'
    time = 30
    disease_label = task_to_disease_code[radio_input2.value]

data_icu = False
data_mort = (label == "Mortality")
data_admn = (label == 'Readmission')
data_los = (label == 'Length of Stay')

# Any selection without a mapped code falls back to the "no filter" sentinel.
icd_code = filter_to_icd_code.get(radio_input3.value, 'No Disease Filter')

version_path = "mimiciv/2.2"
cohort_output = day_intervals_cohort_v2.extract_data(
    radio_input1, label, time, icd_code, root_dir, disease_label
)

EXTRACTING FOR: | NON-ICU | READMISSION DUE TO E11 | 30 | 
[ READMISSION DUE TO E11 ]


100%|██████████| 34755/34755 [06:49<00:00, 84.86it/s] 


[ READMISSION LABELS FINISHED ]
[ COHORT SUCCESSFULLY SAVED ]
[ SUMMARY SUCCESSFULLY SAVED ]
Readmission FOR Non-ICU DATA
# Admission Records: 97965
# Patients: 34755
# Positive cases: 20262
# Negative cases: 77703


## 2. FEATURE SELECTION
Features available for Non-ICU data -
- Diagnosis (https://mimic.mit.edu/docs/iv/modules/hosp/diagnoses_icd/)
- Procedures (https://mimic.mit.edu/docs/iv/modules/hosp/procedures_icd/)
- Medications (https://mimic.mit.edu/docs/iv/modules/hosp/prescriptions/)
- Lab Events (https://mimic.mit.edu/docs/iv/modules/hosp/labevents/)

All features will be saved in **./data/features/**

**Please run below cell to select features**

In [5]:
# Enable all four non-ICU feature families (diagnosis, labs, procedures, medications).
diag_flag = lab_flag = proc_flag = med_flag = True
feature_nonicu(
    cohort_output, version_path,
    diag_flag, lab_flag, proc_flag, med_flag,
)

[EXTRACTING DIAGNOSIS DATA]


100%|██████████| 6258/6258 [00:15<00:00, 399.61it/s]


# unique ICD-9 codes 6258
# unique ICD-10 codes 9534
# unique ICD-10 codes (After converting ICD-9 to ICD-10) 9838
# unique ICD-10 codes (After clinical gruping ICD-10 codes) 1457
# Admissions:   97965
[SUCCESSFULLY SAVED DIAGNOSIS DATA]
[EXTRACTING PROCEDURES DATA]
# Unique ICD9 Procedures:   1861
# Unique ICD10 Procedures:  5183

Value counts of each ICD version:
 9     110628
10     54166
Name: icd_version, dtype: int64
# Admissions:   53850
Total number of rows:  164794
[SUCCESSFULLY SAVED PROCEDURES DATA]
[EXTRACTING MEDICATIONS DATA]
Number of unique type of drug:  2012
Number of unique type of drug (after grouping to use Non propietary names):  970
Total number of rows:  2580611
# Admissions:   89550
[SUCCESSFULLY SAVED MEDICATIONS DATA]
[EXTRACTING LABS DATA]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

# Itemid:  556
# Admissions:  95763
Total number of rows:  17567713
[SUCCESSFULLY SAVED LABS DATA]


## 3. CLINICAL GROUPING
Below you will have option to clinically group diagnosis and medications.
Grouping medical codes will reduce dimensional space of features.

Default options selected below will group medical codes to reduce feature dimension space.

**Please run below cell to select preprocessing for different features**

In [6]:
# One radio group per enabled feature family; each default is the option that groups codes.
if diag_flag:
    print("Do you want to group ICD 10 DIAG codes ?")
    diag_group_options = [
        'Keep both ICD-9 and ICD-10 codes',
        'Convert ICD-9 to ICD-10 codes',
        'Convert ICD-9 to ICD-10 and group ICD-10 codes',
    ]
    radio_input4 = widgets.RadioButtons(
        options=diag_group_options,
        value=diag_group_options[-1],
        layout=dict(width='100%'),
    )
    display(radio_input4)
if med_flag:
    print("Do you want to group Medication codes to use Non propietary names?")
    radio_input5 = widgets.RadioButtons(
        options=['Yes', 'No'],
        value='Yes',
        layout=dict(width='100%'),
    )
    display(radio_input5)
if proc_flag:
    print("Which ICD codes for Procedures you want to keep in data?")
    radio_input6 = widgets.RadioButtons(
        options=['ICD-9 and ICD-10', 'ICD-10'],
        value='ICD-10',
        layout=dict(width='100%'),
    )
    display(radio_input6)
print("**Please run below cell to perform feature preprocessing**")

Do you want to group ICD 10 DIAG codes ?


RadioButtons(index=2, layout=Layout(width='100%'), options=('Keep both ICD-9 and ICD-10 codes', 'Convert ICD-9…

Do you want to group Medication codes to use Non propietary names?


RadioButtons(layout=Layout(width='100%'), options=('Yes', 'No'), value='Yes')

Which ICD codes for Procedures you want to keep in data?


RadioButtons(index=1, layout=Layout(width='100%'), options=('ICD-9 and ICD-10', 'ICD-10'), value='ICD-10')

**Please run below cell to perform feature preprocessing**


In [7]:
# Take each grouping choice from its widget; a disabled feature family keeps False.
group_diag = radio_input4.value if diag_flag else False
group_med = radio_input5.value if med_flag else False
group_proc = radio_input6.value if proc_flag else False

# Labs are skipped in this pass (4th flag False) and no outlier cleaning is requested.
preprocess_features_hosp(
    cohort_output,
    diag_flag, proc_flag, med_flag, False,
    group_diag, group_med, group_proc,
    False, False, 0, 0,
)

[PROCESSING DIAGNOSIS DATA]
Total number of rows 1398874
[SUCCESSFULLY SAVED DIAGNOSIS DATA]
[PROCESSING MEDICATIONS DATA]
Total number of rows 2575767
[SUCCESSFULLY SAVED MEDICATIONS DATA]
[PROCESSING PROCEDURES DATA]
Total number of rows 54166
[SUCCESSFULLY SAVED PROCEDURES DATA]


### 4. SUMMARY OF FEATURES

This step will generate summary of all features extracted so far.<br>
It will save summary files in **./data/summary/**<br>
- These files provide summary about **mean frequency** of medical codes per admission.<br>
- It also provides **total occurrence count** of each medical code.<br>
- For labs and chart events it will also provide <br>**missing %**, which tells how many rows for a certain medical code have a missing value.

Please use this information to further refine your cohort by selecting <br>which medical codes in each feature you want to keep and <br>which codes you would like to remove for downstream analysis tasks.

**Please run below cell to generate summary files**

In [8]:
# Generate the feature summary for the feature families enabled above
# (diagnosis, procedures, medications, labs).
generate_summary_hosp(diag_flag,proc_flag,med_flag,lab_flag)

[GENERATING FEATURE SUMMARY]


2it [00:19,  9.67s/it]


[SUCCESSFULLY SAVED FEATURE SUMMARY]


## 5. Feature Selection

Based on the files generated in the previous step and other information gathered by you,<br>
Please select which medical codes you want to include in this study.

Please run the cell below to select the features for which you want to perform feature selection.

- Select **Yes** if you want to select a subset of medical codes for that feature and<br> **edit** the corresponding feature file for it.
- Select **No** if you want to keep all the codes in a feature.

In [9]:
def _ask_yes_no(prompt):
    """Print ``prompt``, display a Yes/No radio group defaulting to 'No', and return the widget."""
    print(prompt)
    choice = widgets.RadioButtons(options=['Yes', 'No'], value='No')
    display(choice)
    return choice


# One question per enabled feature family; radio_inputN is only (re)bound when its flag is set.
if diag_flag:
    radio_input4 = _ask_yes_no("Do you want to do Feature Selection for Diagnosis \n (If yes, please edit list of codes in ./data/summary/diag_features.csv)")
if med_flag:
    radio_input5 = _ask_yes_no("Do you want to do Feature Selection for Medication \n (If yes, please edit list of codes in ./data/summary/med_features.csv)")
if proc_flag:
    radio_input6 = _ask_yes_no("Do you want to do Feature Selection for Procedures \n (If yes, please edit list of codes in ./data/summary/proc_features.csv)")
if lab_flag:
    radio_input7 = _ask_yes_no("Do you want to do Feature Selection for Labs \n (If yes, please edit list of codes in ./data/summary/lab_features.csv)")
print("**Please run below cell to perform feature selection**")

Do you want to do Feature Selection for Diagnosis 
 (If yes, please edit list of codes in ./data/summary/diag_features.csv)


RadioButtons(index=1, options=('Yes', 'No'), value='No')

Do you want to do Feature Selection for Medication 
 (If yes, please edit list of codes in ./data/summary/med_features.csv)


RadioButtons(index=1, options=('Yes', 'No'), value='No')

Do you want to do Feature Selection for Procedures 
 (If yes, please edit list of codes in ./data/summary/proc_features.csv)


RadioButtons(index=1, options=('Yes', 'No'), value='No')

Do you want to do Feature Selection for Labs 
 (If yes, please edit list of codes in ./data/summary/lab_features.csv)


RadioButtons(index=1, options=('Yes', 'No'), value='No')

**Please run below cell to perform feature selection**


In [10]:
# Set but not passed to the call below.
select_out = False
select_chart = False

# A family is selected only if it is enabled and its widget says 'Yes'.
select_diag = (radio_input4.value == 'Yes') if diag_flag else False
select_med = (radio_input5.value == 'Yes') if med_flag else False
select_proc = (radio_input6.value == 'Yes') if proc_flag else False
select_lab = (radio_input7.value == 'Yes') if lab_flag else False

features_selection_hosp(
    cohort_output,
    diag_flag, proc_flag, med_flag, lab_flag,
    select_diag, select_med, select_proc, select_lab,
)

## 6. CLEANING OF FEATURES
Below you will have the option to clean lab and chart events by performing outlier removal and unit conversion.

Outlier removal is performed to remove values higher than selected **right threshold** percentile and lower than selected **left threshold** percentile among all values for each itemid. 

**Please run below cell to select preprocessing for different features**

In [11]:
if lab_flag:
    print("Outlier removal in values of lab events ?")
    # Explicit width and height for the radio group.
    layout = widgets.Layout(width='100%', height='40px')

    outlier_modes = [
        'No outlier detection',
        'Impute Outlier (default:98)',
        'Remove outliers (default:98)',
    ]
    radio_input7 = widgets.RadioButtons(options=outlier_modes, value=outlier_modes[0], layout=layout)
    display(radio_input7)

    # Right (upper) percentile threshold: 90-99, default 98.
    outlier = widgets.IntSlider(
        value=98, min=90, max=99, step=1,
        disabled=False, layout={'width': '100%'},
    )
    # Left (lower) percentile threshold: 0-10, default 0.
    left_outlier = widgets.IntSlider(
        value=0, min=0, max=10, step=1,
        disabled=False, layout={'width': '100%'},
    )

    display(widgets.HBox([
        widgets.Label('Right Outlier Threshold', layout={'width': '150px'}),
        outlier,
    ]))
    display(widgets.HBox([
        widgets.Label('Left Outlier Threshold', layout={'width': '150px'}),
        left_outlier,
    ]))
print("**Please run below cell to perform feature preprocessing**")

Outlier removal in values of lab events ?


RadioButtons(layout=Layout(height='40px', width='100%'), options=('No outlier detection', 'Impute Outlier (def…

HBox(children=(Label(value='Right Outlier Threshold', layout=Layout(width='150px')), IntSlider(value=98, layou…

HBox(children=(Label(value='Left Outlier Threshold', layout=Layout(width='150px')), IntSlider(value=0, layout=…

**Please run below cell to perform feature preprocessing**


In [12]:
# Defaults for the case where lab features are disabled. Without them the call
# below raised NameError, because clean_lab / impute_outlier / left_thresh were
# only bound inside the `if lab_flag:` branch.
clean_lab = False
impute_outlier = False
thresh = 0
left_thresh = 0

if lab_flag:
    # Any choice other than 'No outlier detection' enables cleaning;
    # the 'Impute' choice additionally switches on imputation.
    clean_lab = radio_input7.value != 'No outlier detection'
    impute_outlier = radio_input7.value == 'Impute Outlier (default:98)'
    thresh = outlier.value
    left_thresh = left_outlier.value

# Only the lab flag is passed through in this pass; the diagnosis, procedure,
# medication and grouping arguments are all False.
preprocess_features_hosp(
    cohort_output,
    False, False, False, lab_flag,
    False, False, False,
    clean_lab, impute_outlier, thresh, left_thresh,
)

## 7. Add diag names and lab names

In [13]:
# Where the pipeline wrote its preprocessed feature tables, and where the
# annotated tables produced below are saved.
read_dir = './data/features/'
save_dir = './processed/'

# The cells below write CSVs into save_dir; create it up front so the first
# to_csv does not fail when the directory does not exist yet.
os.makedirs(save_dir, exist_ok=True)

### cohort

In [14]:
# NOTE(review): this file name hard-codes the non-ICU / readmission / 30-day / E11
# selection. It presumably has to match what the extraction step wrote for the
# current widget choices — confirm whether it can be derived from `cohort_output`.
cohort_file = './data/cohort/cohort_non-icu_readmission_30_E11.csv.gz'
cohort = pd.read_csv(cohort_file, compression='gzip')
cohort.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97965 entries, 0 to 97964
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   subject_id  97965 non-null  float64
 1   hadm_id     97965 non-null  float64
 2   admittime   97965 non-null  object 
 3   dischtime   97965 non-null  object 
 4   Age         97965 non-null  float64
 5   gender      97965 non-null  object 
 6   ethnicity   97965 non-null  object 
 7   insurance   97965 non-null  object 
 8   label       97965 non-null  int64  
dtypes: float64(3), int64(1), object(5)
memory usage: 6.7+ MB


In [15]:
def split_df(dataframe):
    """Tag each row of ``dataframe`` as train / validation / test by row position.

    The first 60% of the rows (rounded down) are labelled ``'train'``, the next
    20% (rounded down) ``'validation'`` and all remaining rows ``'test'``.
    Rows are not shuffled: the split follows the existing row order.

    Labels are assigned by position, so the result does not depend on the
    frame's index. (Label-based ``.loc[a:b]`` slicing gives the intended split
    only on a default 0..n-1 index and mislabels rows otherwise.)

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to annotate. It is modified in place: a ``'split'`` column is
        added, or overwritten if it already exists.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, now carrying the ``'split'`` column.
    """
    total_rows = len(dataframe)
    train_end = int(total_rows * 0.6)
    validation_end = train_end + int(total_rows * 0.2)

    # Build the labels as a plain list so assignment is purely positional.
    split_labels = (
        ['train'] * train_end
        + ['validation'] * (validation_end - train_end)
        + ['test'] * (total_rows - validation_end)
    )
    dataframe['split'] = split_labels
    return dataframe

# Collapse fine-grained ethnicity values into broader groups.
ethnicity_groups = {
    'WHITE - RUSSIAN': 'WHITE',
    'WHITE - OTHER EUROPEAN': 'WHITE',
    'HISPANIC/LATINO - PUERTO RICAN': 'HISPANIC OR LATINO',
    'ASIAN - CHINESE': 'ASIAN',
    'BLACK/CAPE VERDEAN': 'BLACK/AFRICAN AMERICAN',
    'HISPANIC/LATINO - DOMINICAN': 'HISPANIC OR LATINO',
    'BLACK/CARIBBEAN ISLAND': 'BLACK/AFRICAN AMERICAN',
    'BLACK/AFRICAN': 'BLACK/AFRICAN AMERICAN',
    'PORTUGUESE': 'WHITE',
    'UNABLE TO OBTAIN': 'UNKNOWN',
    'WHITE - EASTERN EUROPEAN': 'WHITE',
    'ASIAN - SOUTH EAST ASIAN': 'ASIAN',
    'HISPANIC/LATINO - GUATEMALAN': 'HISPANIC OR LATINO',
    'PATIENT DECLINED TO ANSWER': 'UNKNOWN',
    'AMERICAN INDIAN/ALASKA NATIVE': 'OTHER',
    'ASIAN - ASIAN INDIAN': 'ASIAN',
    'HISPANIC/LATINO - SALVADORAN': 'HISPANIC OR LATINO',
    'HISPANIC/LATINO - CUBAN': 'HISPANIC OR LATINO',
    'HISPANIC/LATINO - CENTRAL AMERICAN': 'HISPANIC OR LATINO',
    'HISPANIC/LATINO - HONDURAN': 'HISPANIC OR LATINO',
    'HISPANIC/LATINO - COLUMBIAN': 'HISPANIC OR LATINO',
    'HISPANIC/LATINO - MEXICAN': 'HISPANIC OR LATINO',
    'WHITE - BRAZILIAN': 'WHITE',
    'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER': 'OTHER',
    'SOUTH AMERICAN': 'HISPANIC OR LATINO',
    'ASIAN - KOREAN': 'ASIAN',
    'MULTIPLE RACE/ETHNICITY': 'OTHER',
}
cohort_new = cohort.replace(ethnicity_groups)

# Restrict the cohort to the two ethnicity groups used for stratification.
# .copy() makes the filtered frame independent, so adding a column below is
# not a write to a slice of `cohort`.
kept_ethnicities = ['WHITE', 'BLACK/AFRICAN AMERICAN']
cohort_new = cohort_new[cohort_new['ethnicity'].isin(kept_ethnicities)].copy()
cohort_new['insurance_01'] = cohort_new['insurance'] == 'Other'

# Split every (gender, label, ethnicity, insurance) stratum 60/20/20 on its own
# so each split keeps the same mix of strata. The loop variables carry a
# *_value suffix so they do not overwrite notebook globals such as `label`.
split_dfs = []
for gender_value in ['F', 'M']:
    for label_value in [0, 1]:
        for ethnicity_value in kept_ethnicities:
            for insurance_value in [True, False]:
                subset = cohort_new[(cohort_new['gender'] == gender_value) &
                                    (cohort_new['label'] == label_value) &
                                    (cohort_new['ethnicity'] == ethnicity_value) &
                                    (cohort_new['insurance_01'] == insurance_value)]
                if not subset.empty:
                    # Fresh 0..n-1 index per stratum; drop=True avoids creating
                    # helper index columns that would have to be removed later.
                    split_dfs.append(split_df(subset.reset_index(drop=True)))

cohort_new = pd.concat(split_dfs, ignore_index=True)
cohort_new = cohort_new.drop(columns=['Age', 'insurance_01'])
print(cohort_new.split.value_counts())
cohort_new.sample(5)

train         48590
test          16218
validation    16191
Name: split, dtype: int64


Unnamed: 0,subject_id,hadm_id,admittime,dischtime,gender,ethnicity,insurance,label,split
52588,12064199.0,20575220.0,2177-07-28 07:15:00,2177-08-03 13:25:00,M,WHITE,Medicare,0,train
45239,16335352.0,20728992.0,2193-01-17 16:36:00,2193-01-18 15:15:00,M,WHITE,Other,0,validation
47787,18541554.0,24582627.0,2114-07-05 21:32:00,2114-07-09 18:20:00,M,WHITE,Other,0,test
65630,12522208.0,26224516.0,2155-04-27 08:37:00,2155-05-17 16:21:00,M,BLACK/AFRICAN AMERICAN,Other,0,train
75165,11494753.0,29021708.0,2126-08-31 03:33:00,2126-09-04 11:00:00,M,WHITE,Medicare,1,train


In [16]:
# Write the split-annotated cohort to disk, then show its schema.
cohort_new.to_csv(f"{save_dir}cohort.csv")
cohort_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80999 entries, 0 to 80998
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   subject_id  80999 non-null  float64
 1   hadm_id     80999 non-null  float64
 2   admittime   80999 non-null  object 
 3   dischtime   80999 non-null  object 
 4   gender      80999 non-null  object 
 5   ethnicity   80999 non-null  object 
 6   insurance   80999 non-null  object 
 7   label       80999 non-null  int64  
 8   split       80999 non-null  object 
dtypes: float64(2), int64(1), object(6)
memory usage: 5.6+ MB


### Diag

In [17]:
# Load the preprocessed diagnosis table from the features directory.
diag_file = read_dir + 'preproc_diag.csv.gz'
diag = pd.read_csv(diag_file, compression='gzip')
diag.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1398874 entries, 0 to 1398873
Data columns (total 3 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   subject_id    1398874 non-null  int64 
 1   hadm_id       1398874 non-null  int64 
 2   new_icd_code  1398874 non-null  object
dtypes: int64(2), object(1)
memory usage: 32.0+ MB


In [18]:
# Diagnosis dictionary, restricted to ICD version 10 entries.
icd_map = pd.read_csv('./mimiciv/2.2/hosp/d_icd_diagnoses.csv.gz', compression='gzip')
icd_map = icd_map.loc[icd_map['icd_version'] == 10]

# Attach the long title to each diagnosis code, then keep only cohort
# admissions and the columns needed downstream.
diag_new = diag.merge(icd_map, how='left', left_on='new_icd_code', right_on='icd_code')
diag_new = diag_new.loc[
    diag_new['hadm_id'].isin(cohort_new['hadm_id']),
    ['subject_id', 'hadm_id', 'new_icd_code', 'long_title'],
]
diag_new.sample(5)

Unnamed: 0,subject_id,hadm_id,new_icd_code,long_title
222378,11582633,25206766,F32,"Major depressive disorder, single episode"
1114873,17968595,29041322,K21,Gastro-esophageal reflux disease
32045,10252385,21057326,G43,Migraine
1258704,18974079,27761005,I25,Chronic ischemic heart disease
797948,15661244,27008859,E66,Overweight and obesity


In [19]:
# Write the annotated diagnosis table to disk, then show its schema.
diag_new.to_csv(f"{save_dir}diag.csv")
diag_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1169248 entries, 0 to 1398873
Data columns (total 4 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   subject_id    1169248 non-null  int64 
 1   hadm_id       1169248 non-null  int64 
 2   new_icd_code  1169248 non-null  object
 3   long_title    1169248 non-null  object
dtypes: int64(2), object(2)
memory usage: 44.6+ MB


### Labs

In [20]:
# Load the preprocessed lab-events table from the features directory.
labs_file = read_dir + 'preproc_labs.csv.gz'
labs = pd.read_csv(labs_file, compression='gzip')
labs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17566395 entries, 0 to 17566394
Data columns (total 7 columns):
 #   Column               Dtype  
---  ------               -----  
 0   subject_id           int64  
 1   hadm_id              float64
 2   charttime            object 
 3   itemid               int64  
 4   admittime            object 
 5   lab_time_from_admit  object 
 6   valuenum             float64
dtypes: float64(2), int64(2), object(3)
memory usage: 938.1+ MB


In [21]:
# Lab item dictionary, joined on itemid; the label / fluid / category columns are taken from it.
lab_map = pd.read_csv('./mimiciv/2.2/hosp/d_labitems.csv.gz', compression='gzip')

# Attach the dictionary columns, then keep only cohort admissions and the
# columns needed downstream.
labs_new = labs.merge(lab_map, how='left', on='itemid')
labs_new = labs_new.loc[
    labs_new['hadm_id'].isin(cohort_new['hadm_id']),
    ['subject_id', 'hadm_id', 'itemid', 'lab_time_from_admit', 'valuenum', 'label', 'fluid', 'category'],
]
labs_new.sample(5)

Unnamed: 0,subject_id,hadm_id,itemid,lab_time_from_admit,valuenum,label,fluid,category
15917596,19052360,28611373.0,51301,0 days 18:04:00,9.5,White Blood Cells,Blood,Hematology
5508339,13140343,27308490.0,50960,3 days 06:43:00,1.8,Magnesium,Blood,Chemistry
13317149,17529736,23557183.0,50947,-1 days +10:59:00,7.0,I,Blood,Chemistry
15517526,18810350,26619418.0,51274,0 days 16:05:00,20.6,PT,Blood,Hematology
7096635,14001336,22216032.0,51250,8 days 07:29:00,100.0,MCV,Blood,Hematology


In [22]:
# Write the annotated lab table to disk, then show its schema.
labs_new.to_csv(f"{save_dir}labs.csv")
labs_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14321645 entries, 0 to 17566394
Data columns (total 8 columns):
 #   Column               Dtype  
---  ------               -----  
 0   subject_id           int64  
 1   hadm_id              float64
 2   itemid               int64  
 3   lab_time_from_admit  object 
 4   valuenum             float64
 5   label                object 
 6   fluid                object 
 7   category             object 
dtypes: float64(2), int64(2), object(4)
memory usage: 983.4+ MB


### Proc

In [23]:
# Load the preprocessed procedures table from the features directory.
proc_file = read_dir + 'preproc_proc.csv.gz'
proc = pd.read_csv(proc_file, compression='gzip')
proc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54166 entries, 0 to 54165
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   subject_id            54166 non-null  int64 
 1   hadm_id               54166 non-null  int64 
 2   icd_code              54166 non-null  object
 3   chartdate             54166 non-null  object
 4   admittime             54166 non-null  object
 5   proc_time_from_admit  54166 non-null  object
dtypes: int64(2), object(4)
memory usage: 2.5+ MB


In [24]:
# Procedure dictionary, restricted to ICD version 10 entries.
pcd_map = pd.read_csv('./mimiciv/2.2/hosp/d_icd_procedures.csv.gz', compression='gzip')
pcd_map = pcd_map.loc[pcd_map['icd_version'] == 10]

# Attach the long title to each procedure code, then keep only cohort
# admissions and the columns needed downstream.
proc_new = proc.merge(pcd_map, how='left', on='icd_code')
proc_new = proc_new.loc[
    proc_new['hadm_id'].isin(cohort_new['hadm_id']),
    ['subject_id', 'hadm_id', 'icd_code', 'proc_time_from_admit', 'long_title'],
]
proc_new.sample(5)

Unnamed: 0,subject_id,hadm_id,icd_code,proc_time_from_admit,long_title
35934,16400300,23186972,02110Z9,0 days 10:06:00,"Bypass Coronary Artery, Two Arteries from Left..."
767,10148145,25168500,0W9930Z,0 days 01:50:00,Drainage of Right Pleural Cavity with Drainage...
8913,11584580,21132419,3E0G76Z,1 days 01:46:00,Introduction of Nutritional Substance into Upp...
24491,14342500,25798399,0BC18ZZ,-1 days +07:21:00,"Extirpation of Matter from Trachea, Via Natura..."
51260,19445384,25197891,0DB64Z3,0 days 00:00:00,"Excision of Stomach, Percutaneous Endoscopic A..."


In [25]:
# Write the annotated procedures table to disk, then show its schema.
proc_new.to_csv(f"{save_dir}proc.csv")
proc_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42834 entries, 0 to 54165
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   subject_id            42834 non-null  int64 
 1   hadm_id               42834 non-null  int64 
 2   icd_code              42834 non-null  object
 3   proc_time_from_admit  42834 non-null  object
 4   long_title            42834 non-null  object
dtypes: int64(2), object(3)
memory usage: 2.0+ MB


### Meds

In [26]:
# Load the preprocessed medications table from the features directory.
meds_file = read_dir + 'preproc_med.csv.gz'
meds = pd.read_csv(meds_file, compression='gzip')
meds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2575767 entries, 0 to 2575766
Data columns (total 8 columns):
 #   Column                  Dtype 
---  ------                  ----- 
 0   subject_id              int64 
 1   hadm_id                 int64 
 2   starttime               object
 3   stoptime                object
 4   drug_name               object
 5   start_hours_from_admit  object
 6   stop_hours_from_admit   object
 7   dose_val_rx             object
dtypes: int64(2), object(6)
memory usage: 157.2+ MB


In [27]:
# Keep only cohort admissions and the columns needed downstream; the
# absolute start/stop timestamps are left out.
meds_new = meds.loc[
    meds['hadm_id'].isin(cohort_new['hadm_id']),
    ['subject_id', 'hadm_id', 'drug_name', 'start_hours_from_admit', 'stop_hours_from_admit', 'dose_val_rx'],
]
meds_new.sample(5)

Unnamed: 0,subject_id,hadm_id,drug_name,start_hours_from_admit,stop_hours_from_admit,dose_val_rx
920284,18570301,29951431,sodium chloride,3 days 14:25:00,4 days 13:25:00,500
325819,13125120,27625175,heparin sodium,0 days 03:49:00,7 days 13:49:00,5000
917925,18423421,26859165,sodium chloride,1 days 20:41:00,2 days 06:41:00,1000
1514691,13537167,25296167,"aluminum hydroxide, magnesium hydroxide, and s...",0 days 02:12:00,0 days 07:12:00,15-30
344753,15306238,26937019,heparin sodium,2 days 03:59:00,4 days 03:59:00,5000


In [28]:
# Write the filtered medications table to disk, then show its schema.
meds_new.to_csv(f"{save_dir}meds.csv")
meds_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2102428 entries, 0 to 2575766
Data columns (total 6 columns):
 #   Column                  Dtype 
---  ------                  ----- 
 0   subject_id              int64 
 1   hadm_id                 int64 
 2   drug_name               object
 3   start_hours_from_admit  object
 4   stop_hours_from_admit   object
 5   dose_val_rx             object
dtypes: int64(2), object(4)
memory usage: 112.3+ MB
