In [292]:
import pandas as pd
import numpy as np
from itertools import chain
import pickle

In [2]:
input_path = "E:/CS_Master_Degree_UIUC/CS598_DeepLearning_for_Health_Data/Project/paper290/MIMIC data/"
output_path = "E:/CS_Master_Degree_UIUC/CS598_DeepLearning_for_Health_Data/Project/paper290/Processed_data/"

## ADMISSIONS.csv
1. Explore data and check what columns it has.
2. Find out all patients that have at least 2 admissions (at least 2 unique "HADM_ID"). Per Appendix1 of paper,  
   the dataset contains 7,537 patients with 2 or more encounters that they used in their CLOUT model.
3. Some demographic invetistigaiton such as distribution of ethnicity among the above subset of patient

In [15]:
# Read original dataset and explore the columns
admissions = pd.read_csv(input_path + "ADMISSIONS.csv")
col_names = admissions.columns
print(col_names)


# Findout the patient IDs ("SUBJECT_ID") that have at least 2 encounters 
AC = admissions.groupby(['SUBJECT_ID']).HADM_ID.nunique()
subset_id = AC.loc[AC > 1].reset_index().SUBJECT_ID	

# Get the subset of original dataset
admissions_subset = admissions.loc[admissions.SUBJECT_ID.isin(subset_id)].sort_values(by=['SUBJECT_ID','ROW_ID'])
admissions_subset = admissions_subset[['SUBJECT_ID', 'HADM_ID','ETHNICITY','HOSPITAL_EXPIRE_FLAG']]

# Verify if we get the right number of patients
patient_number = len(admissions_subset.SUBJECT_ID.unique())
print("The number of patients that have at least two encounters is: ",patient_number)
print("The number of records in admissions_subset is: ", admissions_subset.shape[0])
admissions_subset.head()

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME',
       'DEATHTIME', 'ADMISSION_TYPE', 'ADMISSION_LOCATION',
       'DISCHARGE_LOCATION', 'INSURANCE', 'LANGUAGE', 'RELIGION',
       'MARITAL_STATUS', 'ETHNICITY', 'EDREGTIME', 'EDOUTTIME', 'DIAGNOSIS',
       'HOSPITAL_EXPIRE_FLAG', 'HAS_CHARTEVENTS_DATA'],
      dtype='object')
The number of patients that have at least two encounters is:  7537
The number of records in admissions_subset is:  19993


Unnamed: 0,SUBJECT_ID,HADM_ID,ETHNICITY,HOSPITAL_EXPIRE_FLAG
224,17,194023,WHITE,0
225,17,161087,WHITE,0
229,21,109451,WHITE,0
230,21,111970,WHITE,1
1,23,152223,WHITE,0


In [4]:
### Explore the ethnicity groups in the dataset
patient_ethnicity = pd.DataFrame(admissions_subset.groupby('ETHNICITY').SUBJECT_ID.unique()).reset_index()
patient_ethnicity['ETHNICITY_COUNT'] = patient_ethnicity.apply(lambda x: len(x['SUBJECT_ID']), axis = 1)
print("All ethnicity groups in original dataset: ", patient_ethnicity.ETHNICITY.values)

# Simplify the groups per paper into five basic groups

E_group = ['WHITE','BLACK','ASIAN','HISPANIC']

def Simple_Ethnicity(req, E_group):
    if req ==  'PORTUGUESE':
        return('WHITE')
    for e in E_group:
        if e in req:
            return(e)
    return('OTHER')

patient_ethnicity['ETHNICITY'] = patient_ethnicity.apply(lambda x: Simple_Ethnicity(x['ETHNICITY'], E_group), axis = 1)

patient_ethnicity.head()

All ethnicity groups in original dataset:  ['AMERICAN INDIAN/ALASKA NATIVE'
 'AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE' 'ASIAN'
 'ASIAN - ASIAN INDIAN' 'ASIAN - CAMBODIAN' 'ASIAN - CHINESE'
 'ASIAN - FILIPINO' 'ASIAN - KOREAN' 'ASIAN - OTHER' 'ASIAN - THAI'
 'ASIAN - VIETNAMESE' 'BLACK/AFRICAN' 'BLACK/AFRICAN AMERICAN'
 'BLACK/CAPE VERDEAN' 'BLACK/HAITIAN' 'CARIBBEAN ISLAND'
 'HISPANIC OR LATINO' 'HISPANIC/LATINO - CENTRAL AMERICAN (OTHER)'
 'HISPANIC/LATINO - COLOMBIAN' 'HISPANIC/LATINO - CUBAN'
 'HISPANIC/LATINO - DOMINICAN' 'HISPANIC/LATINO - GUATEMALAN'
 'HISPANIC/LATINO - HONDURAN' 'HISPANIC/LATINO - MEXICAN'
 'HISPANIC/LATINO - PUERTO RICAN' 'HISPANIC/LATINO - SALVADORAN'
 'MIDDLE EASTERN' 'MULTI RACE ETHNICITY'
 'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER' 'OTHER'
 'PATIENT DECLINED TO ANSWER' 'PORTUGUESE' 'UNABLE TO OBTAIN'
 'UNKNOWN/NOT SPECIFIED' 'WHITE' 'WHITE - BRAZILIAN'
 'WHITE - EASTERN EUROPEAN' 'WHITE - OTHER EUROPEAN' 'WHITE - RUSSIAN']


Unnamed: 0,ETHNICITY,SUBJECT_ID,ETHNICITY_COUNT
0,OTHER,"[5782, 14667, 20082, 31755, 40000, 43501]",6
1,OTHER,[42135],1
2,ASIAN,"[94, 191, 203, 299, 605, 907, 986, 1286, 1292,...",163
3,ASIAN,"[2666, 5962, 11043, 28278, 28860, 30650, 31120...",11
4,ASIAN,"[4334, 11171, 40474, 77471]",4


In [5]:
# Summarize the ethnicity group distribution among the patients and compare to Table-4 of Appendix1 of the paper
Ethnicity_table = patient_ethnicity.groupby('ETHNICITY').agg({'ETHNICITY_COUNT':sum})
Ethnicity_table['ETHNICITY_PERCENTAGE'] = np.round(Ethnicity_table['ETHNICITY_COUNT']/Ethnicity_table['ETHNICITY_COUNT'].sum()*100,1)
Ethnicity_table

Unnamed: 0_level_0,ETHNICITY_COUNT,ETHNICITY_PERCENTAGE
ETHNICITY,Unnamed: 1_level_1,Unnamed: 2_level_1
ASIAN,235,3.0
BLACK,885,11.3
HISPANIC,307,3.9
OTHER,665,8.5
WHITE,5736,73.3


## ICD codes
1. datasets include: DIGANOSES_ICD.csv and its dictionary dataset D_DIAGNOSES_ICD.csv.  D_PROCEDURES_ICD.csv.

In [219]:
diag_icd = pd.read_csv(input_path + 'DIAGNOSES_ICD.csv').drop('ROW_ID', axis = 1)
diag_icd_dict = pd.read_csv(input_path + 'D_ICD_DIAGNOSES.csv').drop('ROW_ID', axis = 1)
prod_icd = pd.read_csv(input_path + 'PROCEDURES_ICD.csv').drop('ROW_ID', axis = 1)
prod_icd_dict = pd.read_csv(input_path + 'D_ICD_PROCEDURES.csv').drop('ROW_ID', axis = 1)

In [220]:
diag_icd = diag_icd.merge(diag_icd_dict, on = 'ICD9_CODE', how='left')
prod_icd = prod_icd.merge(prod_icd_dict, on = 'ICD9_CODE', how='left')

In [221]:
print(diag_icd.columns.values)
print(prod_icd.columns.values)

['SUBJECT_ID' 'HADM_ID' 'SEQ_NUM' 'ICD9_CODE' 'SHORT_TITLE' 'LONG_TITLE']
['SUBJECT_ID' 'HADM_ID' 'SEQ_NUM' 'ICD9_CODE' 'SHORT_TITLE' 'LONG_TITLE']


In [252]:
ICD = diag_icd.loc[diag_icd.SUBJECT_ID.isin(subset_id)]
admissions_subset1 = admissions_subset.merge(ICD, on  = ['SUBJECT_ID','HADM_ID'], how='inner').fillna(-1).drop('ETHNICITY', axis=1)
print('The volcabulary size of Diagnoses ICD9 codes is: ',admissions_subset1.ICD9_CODE.nunique())
admissions_subset1.head(10)

The volcabulary size of Diagnoses ICD9 codes is:  4894


Unnamed: 0,SUBJECT_ID,HADM_ID,HOSPITAL_EXPIRE_FLAG,SEQ_NUM,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,17,194023,0,1.0,7455,Secundum atrial sept def,Ostium secundum type atrial septal defect
1,17,194023,0,2.0,45829,Iatrogenc hypotnsion NEC,Other iatrogenic hypotension
2,17,194023,0,3.0,V1259,Hx-circulatory dis NEC,Personal history of other diseases of circulat...
3,17,194023,0,4.0,2724,Hyperlipidemia NEC/NOS,Other and unspecified hyperlipidemia
4,17,161087,0,1.0,4239,Pericardial disease NOS,Unspecified disease of pericardium
5,17,161087,0,2.0,5119,Pleural effusion NOS,Unspecified pleural effusion
6,17,161087,0,3.0,78551,Cardiogenic shock,Cardiogenic shock
7,17,161087,0,4.0,4589,Hypotension NOS,"Hypotension, unspecified"
8,17,161087,0,5.0,311,Depressive disorder NEC,"Depressive disorder, not elsewhere classified"
9,17,161087,0,6.0,7220,Cervical disc displacmnt,Displacement of cervical intervertebral disc w...


In [287]:
subjects = admissions_subset1.SUBJECT_ID.unique()
ICDs = []
for subject in subjects:
    temp1 = admissions_subset1.loc[admissions_subset1.SUBJECT_ID == subject]
    encounters = temp1.HADM_ID.unique()
    codes = []
    for e in encounters:
        temp2 = temp1.loc[temp1.HADM_ID == e]
        codes.append(temp2.ICD9_CODE.tolist())
    ICDs.append(codes)

In [293]:
picklefile = "ICD_seq.pkl"
with open(output_path + picklefile, "wb") as pkl_wb_obj:
    pickle.dump(ICDs, pkl_wb_obj)

# #Check if the ICD_seq was saved properly
# with open(output_path + picklefile, "rb") as pkl_rb_obj:
#     seqs = pickle.load(pkl_rb_obj)
# print(seqs)

[[['7455', '45829', 'V1259', '2724'], ['4239', '5119', '78551', '4589', '311', '7220', '71946', '2724']], [['41071', '78551', '5781', '5849', '40391', '4280', '4592', '5070', '42731', '4271', '41401', '25000', '28521', '1122', '2720', '2749', 'V1046', '43889'], ['0388', '78552', '40391', '42731', '70709', '5119', '6823', '99859', '00845', '5720', '99592', 'V0980', '25000', '2859', '43889', '2749', '41401', '185', '4439', '2449', 'E8788']], [['41401', '4111', '4241', 'V4582', '2724', '4019', '60000', '3899'], ['2252', '3485', '78039', '4241', '4019', '2720', '2724', 'V4581', 'V4579', 'V1582']], [['41071', '4280', '4254', '42731', '9971', '4260', '41401', 'E8790'], ['42789', '42822', '4263', '41401', 'V5861', '4280', '2449', '3659']], [['41401', '4111', '496', '4019', '3051', '53081', '60000', 'V1051', '5968'], ['99831', '99811', '41511', '4538', '99672', '496', '41401', 'V4581', '4019', '60000', '53081', 'V1051'], ['55321', '41511', '5185', '486', '99739', '5180', '41400', '496', '4019'

## Medication
1. Dataset is 'PRESCRIPTIONS.csv' and the column 'DRUG' is the feature for model building

In [294]:
prescriptions = pd.read_csv(input_path + 'PRESCRIPTIONS.csv', low_memory=False).drop(['ROW_ID'],axis = 1)
print(prescriptions.columns.values)

prescriptions.head()

['SUBJECT_ID' 'HADM_ID' 'ICUSTAY_ID' 'STARTDATE' 'ENDDATE' 'DRUG_TYPE'
 'DRUG' 'DRUG_NAME_POE' 'DRUG_NAME_GENERIC' 'FORMULARY_DRUG_CD' 'GSN'
 'NDC' 'PROD_STRENGTH' 'DOSE_VAL_RX' 'DOSE_UNIT_RX' 'FORM_VAL_DISP'
 'FORM_UNIT_DISP' 'ROUTE']


Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,STARTDATE,ENDDATE,DRUG_TYPE,DRUG,DRUG_NAME_POE,DRUG_NAME_GENERIC,FORMULARY_DRUG_CD,GSN,NDC,PROD_STRENGTH,DOSE_VAL_RX,DOSE_UNIT_RX,FORM_VAL_DISP,FORM_UNIT_DISP,ROUTE
0,6,107064,,2175-06-11 00:00:00,2175-06-12 00:00:00,MAIN,Tacrolimus,Tacrolimus,Tacrolimus,TACR1,21796.0,469061711.0,1mg Capsule,2,mg,2,CAP,PO
1,6,107064,,2175-06-11 00:00:00,2175-06-12 00:00:00,MAIN,Warfarin,Warfarin,Warfarin,WARF5,6562.0,56017275.0,5mg Tablet,5,mg,1,TAB,PO
2,6,107064,,2175-06-11 00:00:00,2175-06-12 00:00:00,MAIN,Heparin Sodium,,,HEPAPREMIX,6522.0,338055002.0,"25,000 unit Premix Bag",25000,UNIT,1,BAG,IV
3,6,107064,,2175-06-11 00:00:00,2175-06-12 00:00:00,BASE,D5W,,,HEPBASE,,0.0,HEPARIN BASE,250,ml,250,ml,IV
4,6,107064,,2175-06-11 00:00:00,2175-06-12 00:00:00,MAIN,Furosemide,Furosemide,Furosemide,FURO20,8208.0,54829725.0,20mg Tablet,20,mg,1,TAB,PO


In [295]:
medications = prescriptions[['SUBJECT_ID', 'HADM_ID','DRUG']]
medications.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,DRUG
0,6,107064,Tacrolimus
1,6,107064,Warfarin
2,6,107064,Heparin Sodium
3,6,107064,D5W
4,6,107064,Furosemide


In [297]:
admissions_subset2 = admissions_subset.merge(medications, on = ['SUBJECT_ID','HADM_ID'], how = 'inner').fillna(-1).drop('ETHNICITY', axis=1)
print('The volcabulary size of Medications is: ',admissions_subset2.DRUG.nunique())
admissions_subset2.head(10)
# admissions_subset3.SUBJECT_ID.nunique() #Check if still have 7537 patients

The volcabulary size of Medications is:  3202


Unnamed: 0,SUBJECT_ID,HADM_ID,HOSPITAL_EXPIRE_FLAG,DRUG
0,17,194023,0,Sucralfate
1,17,194023,0,Ketorolac
2,17,194023,0,LR
3,17,194023,0,Morphine Sulfate
4,17,194023,0,Metoclopramide
5,17,194023,0,Oxycodone-Acetaminophen
6,17,194023,0,Acetaminophen
7,17,194023,0,Acetaminophen
8,17,194023,0,Aspirin EC
9,17,194023,0,Aspirin


In [299]:
subjects = admissions_subset2.SUBJECT_ID.unique()
Meds = []
for subject in subjects:
    temp1 = admissions_subset2.loc[admissions_subset2.SUBJECT_ID == subject]
    encounters = temp1.HADM_ID.unique()
    codes = []
    for e in encounters:
        temp2 = temp1.loc[temp1.HADM_ID == e]
        codes.append(temp2.DRUG.tolist())
    Meds.append(codes)

In [301]:
picklefile = "Med_seq.pkl"
with open(output_path + picklefile, "wb") as pkl_wb_obj:
    pickle.dump(Meds, pkl_wb_obj)

# #Check if the Med_seq was saved properly
# with open(output_path + picklefile, "rb") as pkl_rb_obj:
#     seqs = pickle.load(pkl_rb_obj)
# print(seqs)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Lab
1. Datasets include LABEVENTS.csv and its dictionary dataset D_LABEVNETS.csv.

In [303]:
labevents = pd.read_csv(input_path + 'LABEVENTS.csv').drop('ROW_ID',axis = 1)
d_labitems = pd.read_csv(input_path + 'D_LABITEMS.csv').drop('ROW_ID',axis = 1)
print(labevents.columns.values)
print(d_labitems.columns.values)

['SUBJECT_ID' 'HADM_ID' 'ITEMID' 'CHARTTIME' 'VALUE' 'VALUENUM' 'VALUEUOM'
 'FLAG']
['ITEMID' 'LABEL' 'FLUID' 'CATEGORY' 'LOINC_CODE']


In [305]:
labevents = labevents.merge(d_labitems, on = 'ITEMID', how = 'inner')[['SUBJECT_ID','HADM_ID','ITEMID','LABEL']]

In [306]:
admissions_subset3 = admissions_subset.merge(labevents, on = ['SUBJECT_ID','HADM_ID'], how = 'left').fillna(-1).drop('ETHNICITY', axis=1)

print('The volcabulary size of Labs is: ',admissions_subset3.LABEL.nunique())
admissions_subset3.head(10)

The volcabulary size of Labs is:  546


Unnamed: 0,SUBJECT_ID,HADM_ID,HOSPITAL_EXPIRE_FLAG,ITEMID,LABEL
0,17,194023,0,50820.0,pH
1,17,194023,0,50820.0,pH
2,17,194023,0,50820.0,pH
3,17,194023,0,50820.0,pH
4,17,194023,0,50820.0,pH
5,17,194023,0,50820.0,pH
6,17,194023,0,50820.0,pH
7,17,194023,0,50820.0,pH
8,17,194023,0,50820.0,pH
9,17,194023,0,50820.0,pH


In [307]:
subjects = admissions_subset3.SUBJECT_ID.unique()
Labs = []
for subject in subjects:
    temp1 = admissions_subset3.loc[admissions_subset3.SUBJECT_ID == subject]
    encounters = temp1.HADM_ID.unique()
    codes = []
    for e in encounters:
        temp2 = temp1.loc[temp1.HADM_ID == e]
        codes.append(temp2.LABEL.tolist())
    Labs.append(codes)

In [308]:
picklefile = "Lab_seq.pkl"
with open(output_path + picklefile, "wb") as pkl_wb_obj:
    pickle.dump(Labs, pkl_wb_obj)

# #Check if the Med_seq was saved properly
# with open(output_path + picklefile, "rb") as pkl_rb_obj:
#     seqs = pickle.load(pkl_rb_obj)
# print(seqs)