In [1]:
""" 
Prepare MIMIC dataset to run deep patient on the dataset
Desired columns: Patient ID, gender, list of ICD codes, mortality
TODO: add more columns: e.g. number of days in ICU
"""

import pandas as pd
import sys
from datetime import datetime

In [2]:
"""
Read in ADMISSIONS.csv and construct (1) patient ID to admission ID map, and \
    (2) admission ID to duration in hospital map (in seconds)
"""

# pid_admids:     patient ID   -> list of that patient's admission IDs, in file order
# admid_duration: admission ID -> discharge time minus admit time, in seconds
pid_admids = {}
admid_duration = {}

# Timestamp layout shared by the admit-time and discharge-time columns.
ADMISSION_TIME_FORMAT = '%Y-%m-%d %H:%M:%S'

# The context manager guarantees the file is closed even when a malformed row
# makes int() or strptime() raise part-way through the loop.
with open("./mimiciii/ADMISSIONS.csv", 'r') as admission_file:
    next(admission_file, None)  # skip the header row (tolerates an empty file)
    for line in admission_file:
        tokens = line.strip().split(',')
        pid = int(tokens[1])
        admid = int(tokens[2])
        pid_admids.setdefault(pid, []).append(admid)

        admit_time = datetime.strptime(tokens[3], ADMISSION_TIME_FORMAT)
        discharge_time = datetime.strptime(tokens[4], ADMISSION_TIME_FORMAT)
        admid_duration[admid] = (discharge_time - admit_time).total_seconds()


In [3]:
""" 
Read in PATIENTS.csv and construct admission ID to gender map
"""
# admid_gender: admission ID -> single-letter gender of the admitted patient.
admid_gender = {}

# The context manager guarantees the file is closed even if a row fails to parse.
with open("./mimiciii/PATIENTS.csv", 'r') as patient_file:
    next(patient_file, None)  # skip the header row (tolerates an empty file)
    for line in patient_file:
        tokens = line.strip().split(',')
        pid = int(tokens[1])
        # The previous code took character 1 of this field, i.e. it relied on
        # the value being wrapped in double quotes (e.g. "F"). Stripping the
        # quotes first yields the same letter and also accepts an unquoted field.
        gender = tokens[2].strip('"')[:1]
        # A patient without any row in ADMISSIONS.csv has no admissions to
        # label; skip it instead of raising KeyError.
        for admid in pid_admids.get(pid, ()):
            admid_gender[admid] = gender

In [4]:
""" 
Construct admission ID to ICD codes mapping
"""

def convert_to_icd9(dxStr):
    """Insert the decimal point into a compact ICD-9 code string.

    Codes starting with 'E' get the dot after their fourth character; every
    other code gets it after the third. A code with no characters beyond that
    position is returned unchanged.
    """
    split_at = 4 if dxStr.startswith('E') else 3
    if len(dxStr) <= split_at:
        return dxStr
    return dxStr[:split_at] + '.' + dxStr[split_at:]

# admid_codes: admission ID -> list of 'D_'-prefixed ICD-9 code strings,
# in the order the rows appear in DIAGNOSES_ICD.csv.
admid_codes = {}

# The context manager guarantees the file is closed even if a row fails to parse.
with open("./mimiciii/DIAGNOSES_ICD.csv", 'r') as diagnose_file:
    next(diagnose_file, None)  # skip the header row (tolerates an empty file)
    for line in diagnose_file:  # read DIAGNOSES_ICD.csv in order
        tokens = line.strip().split(',')
        admid = int(tokens[2])
        # [1:-1] drops the first and last character of the code field
        # (presumably the surrounding double quotes -- confirm against the
        # raw file) before the decimal point is inserted.
        dxStr = 'D_' + convert_to_icd9(tokens[4][1:-1])
        admid_codes.setdefault(admid, []).append(dxStr)


In [5]:
"""
Convert str codes to int codes
We use 0 to denote NA code in later analysis
"""

# types: code string -> integer ID, numbered from 1 in order of first
# appearance. Each admission's code list is replaced by the matching ID list.
types = {}
for admid, str_codes in admid_codes.items():
    # The default argument len(types) + 1 is evaluated before the lookup, so
    # an unseen code is stored with the next free ID and a known code simply
    # returns the ID it already has.
    admid_codes[admid] = [types.setdefault(c, len(types) + 1) for c in str_codes]


In [6]:
""" 
Construct a dataframe with one row per admission, storing
- gender
- list of ICD codes (integer-encoded)
- duration (length of the hospital stay in seconds)
TODO: patient ID (index key) and expire (mortality) are not included yet
"""

# Build one record per admission that has diagnosis codes, then construct the
# frame in a single call. Growing a DataFrame with pd.concat inside the loop
# copies every existing row on each iteration (quadratic time) and relies on
# concatenating onto an empty frame, which recent pandas versions deprecate.
admission_records = [
    {"gender": admid_gender[admid], "ICD codes": codes, "duration": admid_duration[admid]}
    for admid, codes in admid_codes.items()
]

# Passing `columns` fixes the column order and still yields the three headers
# when there are no records at all.
admission_agg_df = pd.DataFrame(admission_records, columns=['gender', 'ICD codes', 'duration'])

# NOTE(review): the inputs are read from "./mimiciii/" but the result is
# written to "../mimiciii/" -- confirm that the parent directory is intended.
admission_agg_df.to_csv("../mimiciii/ADMISSIONS_AGG.csv", index=False, header=True)