In [1]:
# This script processes MIMIC-III dataset and builds longitudinal diagnosis records for patients with at least two visits. 
# The output data are cPickled, and suitable for training Doctor AI or RETAIN. 
# The original scripts were written by Edward Choi (mp2893@gatech.edu). 
# The original scripts were tidied and corrected for bugs. pandas library is used for dataframe visualization. 
# Additional comments were added for better understanding. 
# Usage: Download the notebook as a Python script into the folder where the MIMIC-III CSV files are located, then execute the command below.
# python process_mimic.py MIMICIIIPROCESSED 

# Output files 
# `<output file>.pids`: List of unique Patient IDs. Used for intermediate processing 
# `<output file>.morts`: List of binary values indicating the mortality of each patient 
# `<output file>.dates`: List of List of Python datetime objects. The outer List is for each patient. The inner List is for each visit made by each patient 
# `<output file>.seqs`: List of List of List of integer diagnosis codes. The outer List is for each patient. The middle List contains visits made by each patient. The inner List contains the integer diagnosis codes that occurred in each visit
# `<output file>.types`: Python dictionary that maps string diagnosis codes to integer diagnosis codes.

### Read original csv files 
The datasets from MIMIC-III database used for this project are:<br /> 
    1. `ADMISSIONS.csv`<br /> 
    2. `DIAGNOSES_ICD.csv`<br /> 
    3. `PATIENTS.csv`<br /> 
    4. `PRESCRIPTIONS.csv`<br /> 
    5. `LABEVENTS.csv`<br /> 
    6. `PROCEDURES_ICD.csv`<br /> 

In [2]:
import csv
import math
import os
import pickle
import sys
from datetime import datetime

import pandas as pd
from dateutil.relativedelta import relativedelta

# Input and output directories.
# Each one can be overridden with an environment variable (MIMIC_DATA_PATH /
# MIMIC_OUT_PATH) so the notebook can run on another machine without editing
# this cell; the defaults are the original local folders.
# os.path.join(path, "") guarantees a trailing separator, so that plain string
# concatenation onto these paths still produces a valid file name.
DATA_PATH = os.path.join(
    os.environ.get(
        "MIMIC_DATA_PATH",
        "E:/CS_Master_Degree_UIUC/CS598_DeepLearning_for_Health_Data/Project/paper290/MIMIC data/",
    ),
    "",
)
OUT_PATH = os.path.join(
    os.environ.get(
        "MIMIC_OUT_PATH",
        "E:/CS_Master_Degree_UIUC/CS598_DeepLearning_for_Health_Data/Project/paper290/MIMIC_Processed/",
    ),
    "",
)

def convert_to_icd9(dxStr):
    '''
    Insert the decimal point into an undotted ICD9 code string.

    Codes starting with 'E' keep four leading characters before the dot,
    all other codes keep three. A code that is not longer than its leading
    part is returned unchanged. This full-length form keeps the sub-classes
    and therefore yields the larger vocabulary.
    '''
    headLen = 4 if dxStr.startswith('E') else 3
    if len(dxStr) <= headLen:
        return dxStr
    return dxStr[:headLen] + '.' + dxStr[headLen:]

def convert_to_3digit_icd9(dxStr):
    '''
    Truncate an undotted ICD9 code string to its major class.

    Codes starting with 'E' are cut to their first four characters, all
    other codes to their first three; shorter codes come back unchanged.
    This coarse form yields the smaller vocabulary.
    '''
    headLen = 4 if dxStr.startswith('E') else 3
    # Slicing past the end of a short string simply returns the whole string.
    return dxStr[:headLen]

# Full paths of the six MIMIC-III tables read by this notebook; all of them
# live directly inside DATA_PATH.
(admissionFile,
 diagnosisFile,
 patientsFile,
 medFile,
 labFile,
 procedureFile) = (
    os.path.join(DATA_PATH, csvName)
    for csvName in (
        'ADMISSIONS.csv',
        'DIAGNOSES_ICD.csv',
        'PATIENTS.csv',
        'PRESCRIPTIONS.csv',
        'LABEVENTS.csv',
        'PROCEDURES_ICD.csv',
    )
)
# The script version took the output prefix from the command line:
# outFile = os.path.join(OUT_PATH, sys.argv[1])

### Map patient or admission IDs to other information  

1. PID map to demographic information 

In [3]:
# Preview of the PATIENTS table. Reuses the path variable built in the setup
# cell instead of concatenating DATA_PATH with the file name, which only works
# when DATA_PATH ends with a separator.
patients = pd.read_csv(patientsFile)
patients.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG
0,234,249,F,2075-03-13 00:00:00,,,,0
1,235,250,F,2164-12-27 00:00:00,2188-11-22 00:00:00,2188-11-22 00:00:00,,1
2,236,251,M,2090-03-15 00:00:00,,,,0
3,237,252,M,2078-03-06 00:00:00,,,,0
4,238,253,F,2089-11-26 00:00:00,,,,0


In [4]:
print('Collecting mortality information')
pidDodMap = {}     # SUBJECT_ID -> 1 when the DOD_HOSP field is non-empty, else 0
pidDobMap = {}     # SUBJECT_ID -> date of birth as a datetime
pidGenderMap = {}  # SUBJECT_ID -> GENDER field exactly as it appears in the file
# `with` closes the file even when a row fails to parse.
with open(patientsFile, 'r') as infd:
    infd.readline()  # skip the header row
    for line in infd:
        # Column order: ROW_ID, SUBJECT_ID, GENDER, DOB, DOD, DOD_HOSP, DOD_SSN, EXPIRE_FLAG
        tokens = line.strip().split(',')
        pid = int(tokens[1])
        dod_hosp = tokens[5]
        # NOTE(review): the field is stored verbatim; if the CSV quotes string
        # fields, the stored gender keeps its quote characters — confirm.
        gender = tokens[2]
        dob = datetime.strptime(tokens[3], '%Y-%m-%d %H:%M:%S')
        pidGenderMap[pid] = gender
        pidDobMap[pid] = dob
        # The label is driven by DOD_HOSP only (column 5), not by DOD.
        pidDodMap[pid] = 1 if len(dod_hosp) > 0 else 0

Collecting mortality information


2. PID map to Admission, Admission map to demographic information 

In [5]:
# Preview of the ADMISSIONS table. Reuses the path variable built in the setup
# cell instead of concatenating DATA_PATH with the file name, which only works
# when DATA_PATH ends with a separator.
admissions = pd.read_csv(admissionFile)
admissions.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,,UNOBTAINABLE,MARRIED,WHITE,2196-04-09 10:06:00,2196-04-09 13:24:00,BENZODIAZEPINE OVERDOSE,0,1
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,,,BRAIN MASS,0,1
3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,,PROTESTANT QUAKER,SINGLE,WHITE,,,INTERIOR MYOCARDIAL INFARCTION,0,1
4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,UNOBTAINABLE,MARRIED,WHITE,2160-11-02 01:01:00,2160-11-02 04:27:00,ACUTE CORONARY SYNDROME,0,1


In [6]:
print('Building pid-admission mapping, admission-date mapping')
pidAdmMap = {}   # SUBJECT_ID -> list of HADM_IDs in file order
admDateMap = {}  # HADM_ID -> admission time as a datetime
admAgeMap = {}   # HADM_ID -> whole years between date of birth and admission
# `with` closes the file even when a row fails to parse.
with open(admissionFile, 'r') as infd:
    infd.readline()  # skip the header row
    for line in infd:
        # Only the first four columns are used: ROW_ID, SUBJECT_ID, HADM_ID, ADMITTIME.
        tokens = line.strip().split(',')
        pid = int(tokens[1])
        admId = int(tokens[2])
        admTime = datetime.strptime(tokens[3], '%Y-%m-%d %H:%M:%S')
        admDateMap[admId] = admTime
        # Requires pidDobMap from the patients cell; raises KeyError for a
        # patient that is missing there.
        admAgeMap[admId] = relativedelta(admTime, pidDobMap[pid]).years
        pidAdmMap.setdefault(pid, []).append(admId)

Building pid-admission mapping, admission-date mapping


3. Admission map to procedure ICD codes

In [7]:
# Preview of the PROCEDURES_ICD table. Reuses the path variable built in the
# setup cell instead of concatenating DATA_PATH with the file name, which only
# works when DATA_PATH ends with a separator.
procedures = pd.read_csv(procedureFile)
procedures.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,944,62641,154460,3,3404
1,945,2592,130856,1,9671
2,946,2592,130856,2,3893
3,947,55357,119355,1,9672
4,948,55357,119355,2,331


In [8]:
print('Building admission-procedure mapping')
admProcMap = {}  # HADM_ID -> list of 'P_'-prefixed, truncated procedure codes
# `with` closes the file even when a row fails to parse.
with open(procedureFile, 'r') as infd:
    infd.readline()  # skip the header row
    for line in infd:
        # Column order: ROW_ID, SUBJECT_ID, HADM_ID, SEQ_NUM, ICD9_CODE
        tokens = line.strip().split(',')
        admId = int(tokens[2])
        # [1:-1] drops the first and last character of the code field
        # (presumably the surrounding quotes of the raw CSV — confirm).
        proc = 'P_' + convert_to_3digit_icd9(tokens[4][1:-1])
        admProcMap.setdefault(admId, []).append(proc)

Building admission-procedure mapping


4. Admission map to medication

In [9]:
# Preview of the PRESCRIPTIONS table. Reuses the path variable built in the
# setup cell instead of concatenating DATA_PATH with the file name, which only
# works when DATA_PATH ends with a separator.
meds = pd.read_csv(medFile, low_memory=False)
meds.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,STARTDATE,ENDDATE,DRUG_TYPE,DRUG,DRUG_NAME_POE,DRUG_NAME_GENERIC,FORMULARY_DRUG_CD,GSN,NDC,PROD_STRENGTH,DOSE_VAL_RX,DOSE_UNIT_RX,FORM_VAL_DISP,FORM_UNIT_DISP,ROUTE
0,2214776,6,107064,,2175-06-11 00:00:00,2175-06-12 00:00:00,MAIN,Tacrolimus,Tacrolimus,Tacrolimus,TACR1,21796.0,469061711.0,1mg Capsule,2,mg,2,CAP,PO
1,2214775,6,107064,,2175-06-11 00:00:00,2175-06-12 00:00:00,MAIN,Warfarin,Warfarin,Warfarin,WARF5,6562.0,56017275.0,5mg Tablet,5,mg,1,TAB,PO
2,2215524,6,107064,,2175-06-11 00:00:00,2175-06-12 00:00:00,MAIN,Heparin Sodium,,,HEPAPREMIX,6522.0,338055002.0,"25,000 unit Premix Bag",25000,UNIT,1,BAG,IV
3,2216265,6,107064,,2175-06-11 00:00:00,2175-06-12 00:00:00,BASE,D5W,,,HEPBASE,,0.0,HEPARIN BASE,250,ml,250,ml,IV
4,2214773,6,107064,,2175-06-11 00:00:00,2175-06-12 00:00:00,MAIN,Furosemide,Furosemide,Furosemide,FURO20,8208.0,54829725.0,20mg Tablet,20,mg,1,TAB,PO


In [10]:
print('Building admission-med mapping')
admMedMap = {}  # HADM_ID -> list of DRUG names, one entry per prescription row
# PRESCRIPTIONS contains quoted fields with embedded commas (for example
# "25,000 unit Premix Bag"), so rows are parsed with csv.reader rather than
# str.split(','): a comma inside a quoted drug name no longer truncates it.
# csv.reader also removes the surrounding quotes, so names are stored unquoted.
# newline='' is what the csv module expects from the file object;
# `with` closes the file even when a row fails to parse.
with open(medFile, 'r', newline='') as infd:
    reader = csv.reader(infd)
    next(reader, None)  # skip the header row
    for tokens in reader:
        if not tokens:  # csv.reader yields [] for a blank line
            continue
        # Column 2 is HADM_ID, column 7 is DRUG.
        admId = int(tokens[2])
        med = tokens[7]
        admMedMap.setdefault(admId, []).append(med)

Building admission-med mapping


5. Admission map to lab component

In [11]:
# Preview of the LABEVENTS table. Reuses the path variable built in the setup
# cell instead of concatenating DATA_PATH with the file name, which only works
# when DATA_PATH ends with a separator.
# NOTE(review): the whole table is loaded only to display five rows; passing
# nrows=5 would be far cheaper if `labs` is not needed elsewhere — confirm.
labs = pd.read_csv(labFile)
labs.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM,FLAG
0,281,3,,50820,2101-10-12 16:07:00,7.39,7.39,units,
1,282,3,,50800,2101-10-12 18:17:00,ART,,,
2,283,3,,50802,2101-10-12 18:17:00,-1,-1.0,mEq/L,
3,284,3,,50804,2101-10-12 18:17:00,22,22.0,mEq/L,
4,285,3,,50808,2101-10-12 18:17:00,0.93,0.93,mmol/L,abnormal


In [12]:
print('Building admission-lab mapping') #3,8
admAllLabMap = {}       # HADM_ID -> ITEMID of every lab row of that admission
admAbnormalLabMap = {}  # HADM_ID -> ITEMID of the rows whose FLAG field is non-empty
no_admission = 0        # rows without a HADM_ID (skipped)
total = 0               # all data rows seen
# Rows are parsed with csv.reader rather than str.split(','): VALUE (column 5)
# is free text, and a comma inside it would shift FLAG (column 8) so that the
# wrong field decides whether the row counts as abnormal.
# newline='' is what the csv module expects from the file object;
# `with` closes the file even when a row fails to parse.
with open(labFile, 'r', newline='') as infd:
    reader = csv.reader(infd)
    next(reader, None)  # skip the header row
    for tokens in reader:
        if not tokens:  # csv.reader yields [] for a blank line
            continue
        total += 1
        # Column order: ROW_ID, SUBJECT_ID, HADM_ID, ITEMID, CHARTTIME, VALUE, VALUENUM, VALUEUOM, FLAG
        if tokens[2] == '':
            no_admission += 1
            continue
        admId = int(tokens[2])
        abnormal_flag = tokens[8]
        lab = tokens[3]

        admAllLabMap.setdefault(admId, []).append(lab)
        # Any non-empty FLAG value marks the row as abnormal.
        if abnormal_flag:
            admAbnormalLabMap.setdefault(admId, []).append(lab)

print("Labs without admissions =", no_admission, "out of", total)

Building admission-lab mapping
Labs without admissions = 5609021 out of 27854055


6. Admission map to diagnoses ICD codes

In [13]:
# Preview of the DIAGNOSES_ICD table. Reuses the path variable built in the
# setup cell instead of concatenating DATA_PATH with the file name, which only
# works when DATA_PATH ends with a separator.
diags = pd.read_csv(diagnosisFile)
diags.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,1297,109,172335,1.0,40301
1,1298,109,172335,2.0,486
2,1299,109,172335,3.0,58281
3,1300,109,172335,4.0,5855
4,1301,109,172335,5.0,4254


In [14]:
print('Building admission-dxList mapping')
admDxMap = {}         # HADM_ID -> list of 'D_'-prefixed full (dotted) diagnosis codes
admDxMap_3digit = {}  # HADM_ID -> list of 'D_'-prefixed truncated diagnosis codes
# `with` closes the file even when a row fails to parse.
with open(diagnosisFile, 'r') as infd:
    infd.readline()  # skip the header row
    for line in infd:
        # Column order: ROW_ID, SUBJECT_ID, HADM_ID, SEQ_NUM, ICD9_CODE
        tokens = line.strip().split(',')
        admId = int(tokens[2])
        # [1:-1] drops the first and last character of the code field
        # (presumably the surrounding quotes of the raw CSV — confirm).
        rawCode = tokens[4][1:-1]
        # Both granularities are always built, one entry per row in each map.
        dxStr = 'D_' + convert_to_icd9(rawCode)
        dxStr_3digit = 'D_' + convert_to_3digit_icd9(rawCode)

        admDxMap.setdefault(admId, []).append(dxStr)
        admDxMap_3digit.setdefault(admId, []).append(dxStr_3digit)

Building admission-dxList mapping


### Build nested list of sequences

Build six mappings, one per feature type. Each maps a patient with at least two visits to that patient's visits in chronological order, and every visit carries one of the following feature lists: <br />
1. visit list of complete version of diagnoses ICD codes: pidSeqMap.<br /> 
2. visit list of short version of diagnoses ICD codes: pidSeqMap_3digit. <br /> 
3. visit list of medication: pidSeqMap_meds. <br /> 
4. visit list of all types of lab components: pidSeqMap_allLabs. <br /> 
5. visit list of abnormal type of lab components: pidSeqMap_abnormalLabs. <br /> 
6. visit list of procedure ICD codes (it has only short version): pidSeqMap_procs. <br /> 

In [15]:
def getList(dictionary, key):
    '''
    Return the value stored under `key`, or a new empty list when the key
    is not present in `dictionary`.
    '''
    return dictionary.get(key, [])

print('Building pid-sortedVisits mapping')
# Each map: SUBJECT_ID -> chronologically ordered list of
# (admission time, feature list, age at admission) tuples, one per visit.
pidSeqMap = {}               # full diagnosis codes
pidSeqMap_3digit = {}        # truncated diagnosis codes
pidSeqMap_meds = {}          # medications
pidSeqMap_allLabs = {}       # all lab items
pidSeqMap_abnormalLabs = {}  # lab items with a non-empty flag
pidSeqMap_procs = {}         # procedure codes

single_admission_count = 0
multiple_admission_count = 0

for pid, admIdList in pidAdmMap.items():
    # Every list in pidAdmMap is created with one element, so only the
    # single-visit case has to be filtered out here.
    if len(admIdList) == 1:
        single_admission_count += 1
    if len(admIdList) < 2:
        continue
    multiple_admission_count += 1

    # Order the admissions ONCE and reuse that order for every feature type.
    # Sorting each (date, feature list, age) tuple list separately would break
    # ties on the feature lists, so two admissions with the same admission
    # time could end up in a different order in different maps.
    orderedAdmIds = sorted(admIdList, key=lambda admId: (admDateMap[admId], admId))

    # Diagnoses are looked up directly: an admission without any diagnosis
    # row raises KeyError. The other feature types fall back to an empty list.
    pidSeqMap[pid] = [
        (admDateMap[admId], admDxMap[admId], admAgeMap[admId]) for admId in orderedAdmIds]
    pidSeqMap_3digit[pid] = [
        (admDateMap[admId], admDxMap_3digit[admId], admAgeMap[admId]) for admId in orderedAdmIds]
    pidSeqMap_meds[pid] = [
        (admDateMap[admId], getList(admMedMap, admId), admAgeMap[admId]) for admId in orderedAdmIds]
    pidSeqMap_allLabs[pid] = [
        (admDateMap[admId], getList(admAllLabMap, admId), admAgeMap[admId]) for admId in orderedAdmIds]
    pidSeqMap_abnormalLabs[pid] = [
        (admDateMap[admId], getList(admAbnormalLabMap, admId), admAgeMap[admId]) for admId in orderedAdmIds]
    pidSeqMap_procs[pid] = [
        (admDateMap[admId], getList(admProcMap, admId), admAgeMap[admId]) for admId in orderedAdmIds]

print('Single admission count: ', single_admission_count)
print('Multiple admission count: ', multiple_admission_count)
print(len(pidSeqMap))

Building pid-sortedVisits mapping
Single admission count:  38983
Multiple admission count:  7537
7537


### Conversion of Seq lists and Build the Type map 

1. Convert the string codes built from above section to integer code list. <br /> 
2. Build dictionary that maps string codes to integer codes. <br />

In [16]:
print('Building pids, dates, mortality_labels, DiagICDSeqs')
# Parallel lists: position i in each list describes the same patient.
pids = []
genders = []
dates = []
ages = []
seqs = []
morts = []

for pid, visits in pidSeqMap.items():
    pids.append(pid)
    genders.append(pidGenderMap[pid])
    morts.append(pidDodMap[pid])
    # Each visit is an (admission time, diagnosis list, age) tuple.
    dates.append([visit[0] for visit in visits])
    seqs.append([visit[1] for visit in visits])
    ages.append([visit[2] for visit in visits])

print('Converting strSeqs to intSeqs, and making types for full ICD9 code')
types = {}    # string code -> integer code, numbered in order of first appearance
newSeqs = []  # same nesting as seqs, with integer codes
for patient in seqs:
    newPatient = []
    for visit in patient:
        newVisit = []
        # Codes are de-duplicated per visit. Iterating a bare set of strings
        # is not reproducible across interpreter runs, so the set is sorted to
        # make the code -> integer assignment the same on every run.
        for code in sorted(set(visit)):
            if code not in types:
                types[code] = len(types)
            newVisit.append(types[code])
        newPatient.append(newVisit)
    newSeqs.append(newPatient)

print(len(types))

Building pids, dates, mortality_labels, DiagICDSeqs
Converting strSeqs to intSeqs, and making types for full ICD9 code
4894


In [17]:
print('Building pids, dates, DiagICDSeqs for 3digit ICD9 code')
# One list of visits per patient; each visit is its list of truncated codes
# (element 1 of the visit tuple).
seqs_3digit = [[visit[1] for visit in visits] for visits in pidSeqMap_3digit.values()]

print('Converting strSeqs to intSeqs, and making types for 3digit ICD9 code')
types_3digit = {}    # string code -> integer code, numbered in order of first appearance
newSeqs_3digit = []  # same nesting as seqs_3digit, with integer codes
for patient in seqs_3digit:
    newPatient = []
    for visit in patient:
        newVisit = []
        # Codes are de-duplicated per visit. Iterating a bare set of strings
        # is not reproducible across interpreter runs, so the set is sorted to
        # make the code -> integer assignment the same on every run.
        for code in sorted(set(visit)):
            if code not in types_3digit:
                types_3digit[code] = len(types_3digit)
            newVisit.append(types_3digit[code])
        newPatient.append(newVisit)
    newSeqs_3digit.append(newPatient)

print(len(types_3digit))

Building pids, dates, DiagICDSeqs for 3digit ICD9 code
Converting strSeqs to intSeqs, and making types for 3digit ICD9 code
942


In [18]:
print('Building pids, dates, ProcICDSeqs')
# One list of visits per patient; each visit is its list of procedure codes
# (element 1 of the visit tuple).
seqs_procs = [[visit[1] for visit in visits] for visits in pidSeqMap_procs.values()]

print('Converting strSeqs to intSeqs, and making types for procedures')
types_procs = {}    # string code -> integer code, numbered in order of first appearance
newSeqs_procs = []  # same nesting as seqs_procs, with integer codes
for patient in seqs_procs:
    newPatient = []
    for visit in patient:
        newVisit = []
        # Codes are de-duplicated per visit. Iterating a bare set of strings
        # is not reproducible across interpreter runs, so the set is sorted to
        # make the code -> integer assignment the same on every run.
        for code in sorted(set(visit)):
            if code not in types_procs:
                types_procs[code] = len(types_procs)
            newVisit.append(types_procs[code])
        newPatient.append(newVisit)
    newSeqs_procs.append(newPatient)

print(len(types_procs))

Building pids, dates, ProcICDSeqs
Converting strSeqs to intSeqs, and making types for procedures
562


In [19]:
print('Building pids, dates, strSeqs for meds')
# One list of visits per patient; each visit is its list of medication
# strings (element 1 of the visit tuple).
seqs_meds = [[visit[1] for visit in visits] for visits in pidSeqMap_meds.values()]

print('Converting strSeqs to intSeqs, and making types for meds')
types_meds = {}    # medication string -> integer code, in order of first appearance
newSeqs_meds = []  # same nesting as seqs_meds, with integer codes
for patientVisits in seqs_meds:
    encodedPatient = []
    for visitCodes in patientVisits:
        # Repeated entries within a visit are kept. setdefault assigns the
        # next free integer the first time a string is seen.
        encodedPatient.append(
            [types_meds.setdefault(code, len(types_meds)) for code in visitCodes])
    newSeqs_meds.append(encodedPatient)

print(len(types_meds))

Building pids, dates, strSeqs for meds
Converting strSeqs to intSeqs, and making types for meds
3202


In [20]:
print('Building pids, dates, strSeqs for all labs')
# One list of visits per patient; each visit is its list of lab item strings
# (element 1 of the visit tuple).
seqs_alllabs = [[visit[1] for visit in visits] for visits in pidSeqMap_allLabs.values()]

print('Converting strSeqs to intSeqs, and making types for all labs')
types_alllabs = {}    # lab item string -> integer code, in order of first appearance
newSeqs_alllabs = []  # same nesting as seqs_alllabs, with integer codes
for patientVisits in seqs_alllabs:
    encodedPatient = []
    for visitCodes in patientVisits:
        # Repeated entries within a visit are kept. setdefault assigns the
        # next free integer the first time a string is seen.
        encodedPatient.append(
            [types_alllabs.setdefault(code, len(types_alllabs)) for code in visitCodes])
    newSeqs_alllabs.append(encodedPatient)

print(len(types_alllabs))

Building pids, dates, strSeqs for all labs
Converting strSeqs to intSeqs, and making types for all labs
681


In [21]:
print('Building pids, dates, strSeqs for abnormal labs')
# One list of visits per patient; each visit is its list of flagged lab item
# strings (element 1 of the visit tuple).
seqs_abnormallabs = [[visit[1] for visit in visits] for visits in pidSeqMap_abnormalLabs.values()]

print('Converting strSeqs to intSeqs, and making types for abnormal labs')
types_abnormallabs = {}    # lab item string -> integer code, in order of first appearance
newSeqs_abnormallabs = []  # same nesting as seqs_abnormallabs, with integer codes
for patientVisits in seqs_abnormallabs:
    encodedPatient = []
    for visitCodes in patientVisits:
        # Repeated entries within a visit are kept. setdefault assigns the
        # next free integer the first time a string is seen.
        encodedPatient.append(
            [types_abnormallabs.setdefault(code, len(types_abnormallabs)) for code in visitCodes])
    newSeqs_abnormallabs.append(encodedPatient)

print(len(types_abnormallabs))

Building pids, dates, strSeqs for abnormal labs
Converting strSeqs to intSeqs, and making types for abnormal labs
284


### Save all seq lists and type lists built from previous sections 

In [22]:
# Common prefix of every output file; each artifact only adds its own suffix.
outFile = os.path.join(OUT_PATH, 'MIMICIIIPROCESSED')

# (file suffix, object to pickle), written in this order.
outputs = [
    ('.pids', pids),
    ('.genders', genders),
    ('.dates', dates),
    ('.ages', ages),
    ('.morts', morts),  # Labels
    ('.seqs', newSeqs),
    ('.types', types),
    ('.3digitICD9.seqs', newSeqs_3digit),
    ('.3digitICD9.types', types_3digit),
    ('.meds.seqs', newSeqs_meds),
    ('.meds.types', types_meds),
    ('.alllabs.seqs', newSeqs_alllabs),
    ('.alllabs.types', types_alllabs),
    ('.abnlabs.seqs', newSeqs_abnormallabs),
    ('.abnlabs.types', types_abnormallabs),
    ('.procs.seqs', newSeqs_procs),
    ('.procs.types', types_procs),
]

for suffix, payload in outputs:
    # `with` flushes and closes each file as soon as it is written, instead of
    # leaving the handles open until they happen to be garbage-collected.
    with open(outFile + suffix, 'wb') as outfd:
        pickle.dump(payload, outfd, -1)  # -1 selects the highest pickle protocol