## Basic  Preparation for MIMIC-III Dataset for Discharge Codes Longitudinal Predictions Task

### Main Preparation Steps

1. Load dataframes from the downloaded dataset from Physionet.
2. Select patients with at least two admissions (i.e. for sequential prediction task).
3. Map subjects to single ethnicity in admissions.
4. Export three tables to the cohort directory:
    1. `adm_df.csv.gz`: for admission information.
    2. `static_df.csv.gz`: for demographic information (DoB, ethnicity, gender).
    3. `dx_df.csv.gz`: for discharge codes information linked to each admission.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
# Set the raw dataset directory (folder) absolute path.
mimic_dir = '/home/asem/GP/ehr-data/physionet.org/files/mimiciii/1.4'

# Set the target cohort directory.
cohort_dir = '/home/asem/GP/ehr-data/mimic3-cohort'
Path(cohort_dir).mkdir(parents=True, exist_ok=True)


In [3]:
adm_df = pd.read_csv(f'{mimic_dir}/ADMISSIONS.csv.gz')

In [4]:
adm_df.head()

Unnamed: 0,row_id,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,religion,marital_status,ethnicity,edregtime,edouttime,diagnosis,hospital_expire_flag,has_chartevents_data
0,1,2,163353,2138-07-17 19:04:00,2138-07-21 15:48:00,,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,,NOT SPECIFIED,,ASIAN,,,NEWBORN,0,1
1,2,3,145834,2101-10-20 19:08:00,2101-10-31 13:58:00,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Medicare,,CATHOLIC,MARRIED,WHITE,2101-10-20 17:09:00,2101-10-20 19:24:00,HYPOTENSION,0,1
2,4,5,178980,2103-02-02 04:31:00,2103-02-04 12:15:00,,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,,BUDDHIST,,ASIAN,,,NEWBORN,0,1
3,6,7,118037,2121-05-23 15:05:00,2121-05-27 11:57:00,,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,,CATHOLIC,,WHITE,,,NEWBORN,0,1
4,7,8,159514,2117-11-20 10:22:00,2117-11-24 14:20:00,,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,,CATHOLIC,,WHITE,,,NEWBORN,0,1


In [5]:
f'#subjects: {len(set(adm_df.SUBJECT_ID))}'

AttributeError: 'DataFrame' object has no attribute 'SUBJECT_ID'

In [None]:
patients_n_admission = adm_df.groupby('SUBJECT_ID').size()
patients_n_admission[patients_n_admission > 1].hist(bins=patients_n_admission.max())

In [None]:
# In GRAM paper, all patients with at least 2 visits were utilized (n=7537).
f'#subjects (n_admissions> 1): {len(patients_n_admission[patients_n_admission > 1])}'

#### Filter qualified subjects

In [None]:
qualified_subjects = patients_n_admission[patients_n_admission > 1].index
adm_df = adm_df[adm_df.SUBJECT_ID.isin(qualified_subjects)]

In [None]:
adm_df

In [None]:
def map_ethnicity(subj_df):
    
    def resolution(series):
        # Return the more detailed ethnicity (longer string)
        idx = series.apply(lambda s: len(s)).idxmax()
        return series.loc[idx]
    
    eth_df = subj_df['ETHNICITY']
    eth_df = eth_df[~eth_df.str.contains('UNKNOWN')]
    eth_df = eth_df[~eth_df.str.contains('UNABLE')]
    eth_df = eth_df[~eth_df.str.contains('DECLINED')]
    ethnicity_set = set(eth_df)
    
    if len(ethnicity_set) == 0:
        return 'UNKNOWN'
    if len(ethnicity_set) == 1:
        (ethnicity,) = ethnicity_set
        return ethnicity
    
    if eth_df.str.contains('WHITE').all():
        return resolution(eth_df)
    if eth_df.str.contains('ASIAN').all():
        return resolution(eth_df)
    if eth_df.str.contains('HISPANIC').all():
        return resolution(eth_df)
    if len(ethnicity_set) > 1:
        print(f'Unresolved (return "UNKNOWN"): {ethnicity_set}')
    
    return 'UNKNOWN'

subject_eth_df = adm_df.groupby('SUBJECT_ID').apply(map_ethnicity)

In [None]:
patients_df = pd.read_csv(f'{mimic_dir}/PATIENTS.csv.gz')
patients_df = patients_df[patients_df.SUBJECT_ID.isin(qualified_subjects)].reset_index(drop=True)
patients_df['ETHNICITY'] = patients_df.SUBJECT_ID.map(subject_eth_df)
patients_df = patients_df[['SUBJECT_ID', 'GENDER', 'DOB', 'ETHNICITY']]
patients_df.to_csv(f'{cohort_dir}/static_df.csv.gz', compression='gzip', index=False)

In [None]:
patients_df = pd.read_csv(f'{cohort_dir}/static_df.csv.gz')
patients_df.head()

In [None]:
adm_df

In [None]:
adm_df = adm_df[['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME']]
adm_df.to_csv(f'{cohort_dir}/adm_df.csv.gz', compression='gzip', index=False)


In [None]:
dx_df = pd.read_csv(f'{mimic_dir}/DIAGNOSES_ICD.csv.gz', dtype = {'ICD9_CODE': str})
dx_df = dx_df[dx_df.ICD9_CODE.notnull()].reset_index(drop=True)
dx_df = dx_df[dx_df.SUBJECT_ID.isin(qualified_subjects)].reset_index(drop=True)
dx_df = dx_df[['HADM_ID', 'ICD9_CODE']]
dx_df.to_csv(f'{cohort_dir}/dx_df.csv.gz', compression='gzip', index=False)