In [7]:
""" 
Prepare MIMIC dataset to run deep patient on the dataset
Desired columns: Patient ID, gender, list of ICD codes, mortality
TODO: add more response columns: e.g. number of days in ICU
"""

import collections
import csv
import json
import sys
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd

In [8]:
""" 
Global parameters
"""
# Substring looked for in each admission's diagnosis text when rows are labelled and filtered.
target_diagnosis = "LUNG CA"

In [None]:
"""
Read in ADMISSIONS.csv and construct (1) patient ID to admission IDs map, \
    (2) admission ID to duration in hospital map (in seconds), \
    (3) admission ID to diagnosis map, and (4) admission ID to admission type map
"""

pid_admids = {}       # patient ID -> list of admission IDs, in file order
admid_duration = {}   # admission ID -> discharge time minus admit time, in seconds
admid_diagnosis = {}  # admission ID -> diagnosis text
admid_type = {}       # admission ID -> admission type

time_format = '%Y-%m-%d %H:%M:%S'

# csv.reader understands quoted fields, so a diagnosis that itself contains
# commas stays in one column instead of being cut apart by a plain split(',')
# (which left only the last fragment in tokens[-3]).
# The context manager closes the file even if a row fails to parse.
with open("../mimiciii/ADMISSIONS.csv", 'r', newline='') as admission_file:
    reader = csv.reader(admission_file)
    next(reader, None)  # skip the header row
    for tokens in reader:
        if not tokens:
            continue  # ignore blank lines
        pid = int(tokens[1])
        admid = int(tokens[2])
        pid_admids.setdefault(pid, []).append(admid)

        admit_time = datetime.strptime(tokens[3], time_format)
        discharge_time = datetime.strptime(tokens[4], time_format)
        admid_duration[admid] = (discharge_time - admit_time).total_seconds()
        # The reader has already removed the surrounding quotes, so the fields
        # are stored as-is rather than sliced with [1:-1].
        admid_diagnosis[admid] = tokens[-3]
        admid_type[admid] = tokens[6]


In [None]:
""" 
Read in PATIENTS.csv and construct admission ID to gender map
"""
admid_gender = {}  # admission ID -> gender character
# The context manager closes the file even when a row fails to parse or a
# patient ID is missing from pid_admids.
with open("../mimiciii/PATIENTS.csv", 'r') as patient_file:
    patient_file.readline()  # skip the header row
    for line in patient_file:
        tokens = line.strip().split(',')
        pid = int(tokens[1])
        # Index 1 skips the first character of the field (presumably an opening
        # quote, as in the other files read here -- confirm against the CSV).
        gender = str(tokens[2])[1]
        # Every admission of a patient gets that patient's gender.
        for admid in pid_admids[pid]:
            admid_gender[admid] = gender

In [None]:
""" 
Construct admission ID to ICD codes mapping
"""

def convert_to_icd9(dxStr):
    """
    Insert the decimal point into a raw ICD-9 diagnosis code.

    Codes starting with 'E' get the point after the fourth character; all
    other codes get it after the third character. Codes too short to have
    anything after the point are returned unchanged. Codes that already
    contain a point are returned unchanged too, so applying the function
    twice is harmless.

    Source: https://github.com/mp2893/med2vec/blob/master/process_mimic.py

    Args:
        dxStr: ICD-9 code as a string, e.g. '01166' or 'E8120'.

    Returns:
        The code with its decimal point, e.g. '011.66' or 'E812.0'.
    """
    if '.' in dxStr:
        # Already formatted; splitting again would produce e.g. '011..66'.
        return dxStr
    split_at = 4 if dxStr.startswith('E') else 3
    if len(dxStr) > split_at:
        return dxStr[:split_at] + '.' + dxStr[split_at:]
    return dxStr

admid_codes = {}  # admission ID -> list of ICD codes, in file order
# The context manager closes the file even if a row fails to parse.
with open("../mimiciii/DIAGNOSES_ICD.csv", 'r') as diagnosis_file:
    diagnosis_file.readline()  # skip the header row
    for line in diagnosis_file:  # rows of DIAGNOSES_ICD.csv, in file order
        tokens = line.strip().split(',')
        admid = int(tokens[2])
        code = tokens[4][1:-1]  # 1:-1 to remove quotes
        admid_codes.setdefault(admid, []).append(code)


In [None]:
""" 
Construct a dataframe to store all information including
- admission ID (index key)
- admission type
- gender
- list of ICD codes
- duration in hospital (in seconds)
- diagnosis
- label (left empty here, filled in by a later cell)
"""

# Collect every row first and build the frame once: concatenating inside the
# loop copies the whole frame on each iteration (quadratic time).
admission_rows = [
    {"admid": admid, "adm_type": admid_type[admid], "gender": admid_gender[admid],
     "duration": admid_duration[admid], "ICD codes": codes, "diagnosis": admid_diagnosis[admid]}
    for admid, codes in admid_codes.items()
]

# 'label' is not part of the rows, so it is created as an empty (NaN) column.
admid_diagnosis_df = pd.DataFrame(
    admission_rows,
    columns=['admid', 'adm_type', 'gender', 'ICD codes', 'duration', 'diagnosis', 'label'],
)

admid_diagnosis_df = admid_diagnosis_df.set_index('admid')

In [9]:
# Transform ICD code to standard ICD-9 code

""" 
Construct admission ID to ICD codes mapping
"""

def convert_to_icd9(dxStr):
    """
    Format a raw ICD-9 diagnosis code by inserting its decimal point.

    'E' codes are split after four characters, every other code after three.
    A code with nothing to put after the point is returned as given, and so
    is a code that already contains a point, which keeps repeated calls safe.

    Adapted from https://github.com/mp2893/med2vec/blob/master/process_mimic.py

    Args:
        dxStr: ICD-9 code as a string, e.g. '01166' or 'E8120'.

    Returns:
        The formatted code, e.g. '011.66' or 'E812.0'.
    """
    # Already formatted: a second split would give e.g. '011..66'.
    if '.' in dxStr:
        return dxStr

    if dxStr.startswith('E'):
        prefix_len = 4
    else:
        prefix_len = 3

    if len(dxStr) <= prefix_len:
        return dxStr
    return dxStr[:prefix_len] + '.' + dxStr[prefix_len:]



In [14]:
# Load the ICD-9 diagnosis dictionary whose codes get formatted in the next cell.
# NOTE(review): absolute, user-specific path -- consider a configurable data directory.
diagnose_path = "/home/wanxinli/EHR-OT/mimic_exp/mimiciii/D_ICD_DIAGNOSES.csv"
diagnose_df = pd.read_csv(
    diagnose_path,
    index_col=None,  # keep the default integer index
    header=0,        # first line holds the column names
)
diagnose_df

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,174,01166,TB pneumonia-oth test,"Tuberculous pneumonia [any form], tubercle bac..."
1,175,01170,TB pneumothorax-unspec,"Tuberculous pneumothorax, unspecified"
2,176,01171,TB pneumothorax-no exam,"Tuberculous pneumothorax, bacteriological or h..."
3,177,01172,TB pneumothorx-exam unkn,"Tuberculous pneumothorax, bacteriological or h..."
4,178,01173,TB pneumothorax-micro dx,"Tuberculous pneumothorax, tubercle bacilli fou..."
...,...,...,...,...
14562,14432,V7399,Scrn unspcf viral dis,Special screening examination for unspecified ...
14563,14433,V740,Screening for cholera,Screening examination for cholera
14564,14434,V741,Screening-pulmonary TB,Screening examination for pulmonary tuberculosis
14565,14435,V742,Screening for leprosy,Screening examination for leprosy (Hansen's di...


In [20]:
# only run once: the last line rewrites the file at diagnose_path in place
# Format each raw code straight from the ICD9_CODE column.
new_codes = [convert_to_icd9(raw_code) for raw_code in diagnose_df['ICD9_CODE']]
diagnose_df['ICD code'] = new_codes
# The formatted code becomes the index, so it is written as the first CSV column.
diagnose_df = diagnose_df.set_index('ICD code')
print(diagnose_df)
diagnose_df.to_csv(diagnose_path, header=True, index=True)

          ROW_ID ICD9_CODE               SHORT_TITLE  \
ICD code                                               
011.66       174     01166     TB pneumonia-oth test   
011.70       175     01170    TB pneumothorax-unspec   
011.71       176     01171   TB pneumothorax-no exam   
011.72       177     01172  TB pneumothorx-exam unkn   
011.73       178     01173  TB pneumothorax-micro dx   
...          ...       ...                       ...   
V73.99     14432     V7399     Scrn unspcf viral dis   
V74.0      14433      V740     Screening for cholera   
V74.1      14434      V741    Screening-pulmonary TB   
V74.2      14435      V742     Screening for leprosy   
V74.3      14436      V743  Screening for diphtheria   

                                                 LONG_TITLE  
ICD code                                                     
011.66    Tuberculous pneumonia [any form], tubercle bac...  
011.70                Tuberculous pneumothorax, unspecified  
011.71    Tuberculous p

In [None]:
from ast import literal_eval

# Reload the per-admission table; the first CSV column is used as the index.
admid_diagnosis_path = "../../outputs/mimic/ADMID_DIAGNOSIS.csv"
admid_diagnosis_df = pd.read_csv(
    admid_diagnosis_path,
    header=0,
    index_col=0,
)
# The code lists come back from CSV as text; literal_eval parses each cell
# as a Python literal.
admid_diagnosis_df['ICD codes'] = admid_diagnosis_df['ICD codes'].apply(literal_eval)
admid_diagnosis_df

In [None]:
""" 
Add target diagnosis label to admid_diagnosis_df
"""
# Format the ICD codes of every admission. This cell writes back to the same
# CSV the table was loaded from, so on a re-run the codes may already be
# formatted; codes that contain a '.' are left untouched because converting
# them again would produce strings such as '011..66'.
admid_diagnosis_df['ICD codes'] = admid_diagnosis_df['ICD codes'].apply(
    lambda codes: [code if '.' in code else convert_to_icd9(code) for code in codes]
)

# Label is 1 when the diagnosis text contains target_diagnosis, else 0.
# na=False: a missing diagnosis is read back from CSV as NaN, on which the
# plain `in` test raises TypeError; such rows get label 0.
# regex=False: keep the literal substring meaning of `in`.
admid_diagnosis_df['label'] = (
    admid_diagnosis_df['diagnosis']
    .str.contains(target_diagnosis, regex=False, na=False)
    .astype(int)
)

admid_diagnosis_df.to_csv("../../outputs/mimic/ADMID_DIAGNOSIS.csv", header=True, index=True)
admid_diagnosis_df

In [None]:
""" 
Choose lung cancer to be the response
Filter out rows whose diagnosis does not contain the target diagnosis
"""
# Keep only the admissions whose diagnosis text contains target_diagnosis.
# na=False: rows with a missing diagnosis are dropped instead of putting NaN
# into the boolean mask, which .loc rejects.
# regex=False: match target_diagnosis literally, like the `in` test used for the labels.
admid_diagnosis_target_df = admid_diagnosis_df.loc[
    admid_diagnosis_df['diagnosis'].str.contains(target_diagnosis, regex=False, na=False)
]
admid_diagnosis_target_df

In [None]:
""" 
Add target diagnosis label to admid_diagnosis_df
"""
# Label is 1 when the diagnosis text contains target_diagnosis, else 0,
# computed for the whole column at once instead of row by row.
# na=False: rows whose diagnosis is missing (NaN) get label 0; the plain `in`
# test raises TypeError on them.
# regex=False: keep the literal substring meaning of `in`.
admid_diagnosis_df['label'] = (
    admid_diagnosis_df['diagnosis']
    .str.contains(target_diagnosis, regex=False, na=False)
    .astype(int)
)

In [None]:
# Write the table to disk (index and header row included), then show it as the cell output.
admid_diagnosis_df.to_csv(
    "../../outputs/mimic/ADMID_DIAGNOSIS.csv",
    index=True,
    header=True,
)
admid_diagnosis_df