In [1]:
""" 
Prepare MIMIC dataset to run deep patient on the dataset
Desired columns: Patient ID, gender, list of ICD codes, mortality
TODO: add more columns: e.g. number of days in ICU
"""

import pandas as pd
import sys
from datetime import datetime

In [2]:
"""
Read in ADMISSIONS.csv and construct patient ID to admission ID map
"""

# Map each patient ID (column 1) to the list of that patient's admission IDs
# (column 2), in the order the rows appear in ADMISSIONS.csv.
pid_admids = {}

# `with` guarantees the handle is closed even if a row fails to parse.
with open("./mimiciii/ADMISSIONS.csv", 'r') as admission_file:
    admission_file.readline()  # skip the header row
    for line in admission_file:
        # Only fields 1 and 2 are read, so a plain comma split is used here.
        tokens = line.strip().split(',')
        pid = int(tokens[1])
        admId = int(tokens[2])
        pid_admids.setdefault(pid, []).append(admId)


In [3]:
""" 
Read in PATIENTS.csv and construct (1) patient ID to gender map, and (2) patient ID to expire map
"""
pid_gender = {}  # patient ID -> gender field with surrounding double quotes removed
pid_expire = {}  # patient ID -> integer parsed from the last column of the row

# The original never closed this file; `with` closes it on success and on error.
with open("./mimiciii/PATIENTS.csv", 'r') as patient_file:
    patient_file.readline()  # skip the header row
    for line in patient_file:
        tokens = line.strip().split(',')
        pid = int(tokens[1])
        # strip('"') removes the quotes when present and leaves an unquoted
        # value intact (indexing [1] would fail on a one-character field).
        pid_gender[pid] = tokens[2].strip('"')
        pid_expire[pid] = int(tokens[-1])

In [4]:
""" 
Construct admission ID to ICD codes mapping
"""

def convert_to_icd9(dxStr):
    """Insert the decimal point into a dot-less ICD-9 code string.

    Codes starting with 'E' keep four leading characters before the dot;
    every other code keeps three. A code that has nothing after that prefix
    is returned unchanged.
    """
    prefix_len = 4 if dxStr.startswith('E') else 3
    if len(dxStr) <= prefix_len:
        return dxStr
    return dxStr[:prefix_len] + '.' + dxStr[prefix_len:]

# Map each admission ID (column 2) to its list of 'D_'-prefixed diagnosis
# codes, in the order the rows appear in DIAGNOSES_ICD.csv.
admid_codes = {}

# `with` guarantees the handle is closed even if a row fails to parse.
with open("./mimiciii/DIAGNOSES_ICD.csv", 'r') as diagnose_file:
    diagnose_file.readline()  # skip the header row
    for line in diagnose_file:
        tokens = line.strip().split(',')
        admid = int(tokens[2])
        # Column 4 holds the code; drop surrounding double quotes (if any)
        # before inserting the decimal point.
        dxStr = 'D_' + convert_to_icd9(tokens[4].strip('"'))
        admid_codes.setdefault(admid, []).append(dxStr)
# print(admid_codes)


In [5]:
""" 
Construct patient ID to ICD codes mapping
"""

# For every patient, concatenate the codes of all of that patient's
# admissions, following the admission order stored in pid_admids.
pid_codes = {}

for pid, admids in pid_admids.items():
    patient_codes = []
    for admid in admids:
        # An admission with no entry in admid_codes contributes nothing
        # instead of raising KeyError.
        patient_codes.extend(admid_codes.get(admid, []))
    pid_codes[pid] = patient_codes
# print(pid_codes)

In [6]:
"""
Convert str codes to int codes
We use 0 to denote NA code in later analysis
"""

# `types` gives each distinct code the next integer, starting at 1, in order
# of first appearance; 0 is never assigned here.
types = {}
for pid in list(pid_codes):
    encoded = []
    for code in pid_codes[pid]:
        # setdefault returns the existing id, or stores and returns the next one.
        encoded.append(types.setdefault(code, len(types) + 1))
    # Replace the patient's string codes with their integer ids.
    pid_codes[pid] = encoded


In [7]:
""" 
Construct a dataframe to store all information including
- patient ID (index key)
- gender
- expire (mortality)
- list of ICD codes
"""

# Collect one record per patient and build the frame once at the end;
# concatenating a one-row frame per patient re-copies the whole frame on
# every iteration.
records = []

# Track the shortest and longest per-patient code list seen.
min_num_codes = float("inf")
max_num_codes = 0

for pid, codes in pid_codes.items():
    records.append({"pid": pid, "gender": pid_gender[pid], "expire": pid_expire[pid], "ICD codes": codes})
    min_num_codes = min(min_num_codes, len(codes))
    max_num_codes = max(max_num_codes, len(codes))

# `columns` fixes the column order and keeps the header when there are no records.
patient_agg_df = pd.DataFrame(records, columns=['pid','gender','expire','ICD codes'])

# index=False omits the row index from the CSV.
patient_agg_df.to_csv("./mimiciii/PATIENTS_AGG.csv", index=False, header=True)

In [8]:
# Report the code-count range per patient and the size of the code vocabulary.
print(f"min_num_codes is: {min_num_codes}")
print(f"max_num_codes is: {max_num_codes}")
print(f"number of ICD code is: {len(types)}")

min_num_codes is: 1
max_num_codes is: 540
number of ICD code is: 6985
