In [14]:
import collections
import json
import os
from collections import defaultdict
from functools import partial

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm


In [15]:
# Previous location: /home/asem/GP/MIMIC-SNONET/RAW/mimic-iii-clinical-database-1.4

# Directory holding the raw MIMIC-III v1.4 CSV files read by this notebook.
# Set the MIMIC_DIR environment variable to use a different location;
# when it is unset the original hard-coded path is used.
mimic_dir = os.environ.get(
    'MIMIC_DIR',
    '/home/asem/GP/ehr-data/mimic3-v1.4/physionet.org/files/mimiciii/1.4')

In [16]:
# Good read: https://iq-inc.com/importerror-attempted-relative-import/

import importlib
import sys

from mimicnet import dag

# Re-execute mimicnet.dag so that edits to the module on disk are picked up
# without restarting the kernel. The reloaded module is this cell's output.
importlib.reload(dag)

<module 'mimicnet.dag' from '/home/asem/GP/MIMIC-SNONET/mimicnet/dag.py'>

In [17]:
# Instantiate the CCSDAG class provided by mimicnet.dag.
KG = dag.CCSDAG()

In [10]:

# LABEVENTS = pd.read_csv(f'{mimic_dir}/LABEVENTS.csv.gz')
# CHARTEVENTS = pd.read_csv(f'{mimic_dir}/CHARTEVENTS.csv.gz')

In [18]:
# Inspect the attributes and methods available on KG.
dir(KG)

['CCS_DIR',
 'DIAG_MULTI_CCS_FILE',
 'DIAG_SINGLE_CCS_FILE',
 'DIR',
 'PROC_MULTI_CCS_FILE',
 'PROC_SINGLE_CCS_FILE',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'ancestors_linkage',
 'common_parent',
 'diag_ccs_children_traversal',
 'diag_icd_codes',
 'diag_icd_label',
 'diag_multi_ccs2icd',
 'diag_multi_ccs_codes',
 'diag_multi_ccs_df',
 'diag_multi_ccs_pt2ch',
 'diag_multi_icd2ccs',
 'diag_single_ccs2icd',
 'diag_single_ccs_codes',
 'diag_single_ccs_df',
 'diag_single_icd2ccs',
 'digraph_from_dataframe',
 'find_diag_icd_name',
 'find_proc_icd_name',
 'get_ccs_parents',
 'get_diag_ccs_children',
 'get_diag_multi_ccs',
 'get_proc_ccs_children',
 'get_proc_multi_cc

In [22]:
# Number of entries in KG.diag_multi_ccs_codes.
len(KG.diag_multi_ccs_codes)

727

### Notes

#### TODO

1. From DIAGNOSES_ICD:
    - map to CCS (multi/single)
    - histogram of diseases (number of patients for each CCS code)
2. From ADMISSIONS and ICUSTAYS:
    - #Admissions per patient
    - #ICUStays per patient
    - #ICUStays per admission
3. Measurements: merge all measurements by HADM_ID (aggregate by the median and by the 0.1 and 0.9 quantiles).

In [11]:
# Read the raw ADMISSIONS table from the MIMIC-III directory into a DataFrame.
ADMISSIONS = pd.read_csv(f'{mimic_dir}/ADMISSIONS.csv.gz')

In [12]:
# Preview the first rows to see which columns the table provides.
ADMISSIONS.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,,UNOBTAINABLE,MARRIED,WHITE,2196-04-09 10:06:00,2196-04-09 13:24:00,BENZODIAZEPINE OVERDOSE,0,1
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,,,BRAIN MASS,0,1
3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,,PROTESTANT QUAKER,SINGLE,WHITE,,,INTERIOR MYOCARDIAL INFARCTION,0,1
4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,UNOBTAINABLE,MARRIED,WHITE,2160-11-02 01:01:00,2160-11-02 04:27:00,ACUTE CORONARY SYNDROME,0,1


In [13]:
# Number of distinct patients (unique SUBJECT_ID values) in ADMISSIONS.
len(set(ADMISSIONS['SUBJECT_ID']))

46520

In [9]:
# Map each patient (SUBJECT_ID) to the set of distinct admission ids (HADM_ID).
patients_admission = defaultdict(set)
for subject_id, hadm_id in zip(ADMISSIONS.SUBJECT_ID, ADMISSIONS.HADM_ID):
    patients_admission[subject_id].add(hadm_id)

# One row per patient with the number of distinct admissions.
patients_admission_df = pd.DataFrame({
    'patient': list(patients_admission),
    'n_admissions': [len(hadm_ids) for hadm_ids in patients_admission.values()],
})

In [10]:
# (patient, number of admissions) pairs, in the iteration order of patients_admission.
patients_admission_count = [
    (patient, len(hadm_ids)) for patient, hadm_ids in patients_admission.items()
]

In [11]:
# Keep only the per-patient admission counts, as a tuple.
# The patient ids are dropped inside the generator rather than assigned to a
# notebook-level `_`: binding `_` shadows IPython's "last output" variable and
# stops IPython from updating it. This form also yields an empty tuple for an
# empty input instead of failing to unpack.
counts = tuple(n_admissions for _patient, n_admissions in patients_admission_count)

In [12]:
# admissions_hist[k] is the number of patients with exactly k admissions.
admissions_hist = np.bincount(counts)

In [13]:
# Display the histogram (index = admissions per patient, value = number of patients).
admissions_hist

array([    0, 38983,  5160,  1342,   508,   246,   113,    51,    31,
          26,    14,    13,     8,     5,     4,     1,     2,     3,
           0,     1,     1,     1,     1,     1,     1,     0,     0,
           0,     0,     0,     0,     1,     0,     0,     2,     0,
           0,     0,     0,     0,     0,     0,     1])

In [14]:
# The GRAM paper used all patients with at least 2 visits (n=7537).
(patients_admission_df.n_admissions > 1).sum()

7537

### Select patients with at least 2 visits

- Follows the cohort selection used in the GRAM paper.
- n=7537

In [15]:
# Output directory for the multi-visit subset of the MIMIC-III tables.
# Set the MIMIC_MULTI_VISIT_DIR environment variable to write elsewhere;
# when it is unset the original hard-coded path is used.
multi_visit_mimic_dir = os.environ.get(
    'MIMIC_MULTI_VISIT_DIR',
    '/home/asem/GP/ehr-data/mimic3-multi-visit')
# Create the directory up front: DataFrame.to_csv does not create missing
# parent directories, so writing into a fresh location would otherwise fail.
os.makedirs(multi_visit_mimic_dir, exist_ok=True)

# Ids of the patients that have more than one admission.
patients_multi_visits = set(patients_admission_df[patients_admission_df.n_admissions > 1].patient.tolist())

In [16]:
# Load PATIENTS, keep only the multi-visit patients, and save the subset.
PATIENTS = (
    pd.read_csv(f'{mimic_dir}/PATIENTS.csv.gz')
    .loc[lambda df: df.SUBJECT_ID.isin(patients_multi_visits)]
    .reset_index(drop=True)
)
PATIENTS.to_csv(f'{multi_visit_mimic_dir}/PATIENTS.csv.gz', index=False, compression='gzip')
PATIENTS.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG
0,234,249,F,2075-03-13 00:00:00,,,,0
1,237,252,M,2078-03-06 00:00:00,,,,0
2,240,256,M,2086-07-31 00:00:00,,,,0
3,628,665,M,2052-05-20 00:00:00,2120-02-04 00:00:00,2120-02-04 00:00:00,,1
4,637,674,F,2113-12-14 00:00:00,2195-02-17 00:00:00,2195-02-17 00:00:00,2195-02-17 00:00:00,1


In [17]:
# Read the filtered PATIENTS table back from the multi-visit directory
# (presumably a sanity check that the written file round-trips).
PATIENTS = pd.read_csv(f'{multi_visit_mimic_dir}/PATIENTS.csv.gz')
PATIENTS.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG
0,234,249,F,2075-03-13 00:00:00,,,,0
1,237,252,M,2078-03-06 00:00:00,,,,0
2,240,256,M,2086-07-31 00:00:00,,,,0
3,628,665,M,2052-05-20 00:00:00,2120-02-04 00:00:00,2120-02-04 00:00:00,,1
4,637,674,F,2113-12-14 00:00:00,2195-02-17 00:00:00,2195-02-17 00:00:00,2195-02-17 00:00:00,1


In [18]:
# Restrict ADMISSIONS to the multi-visit patients and save the subset.
# NOTE: this rebinds ADMISSIONS to the filtered table, so cells that compute
# statistics on the full table give different results if re-run after this one.
ADMISSIONS = (
    ADMISSIONS
    .loc[lambda df: df.SUBJECT_ID.isin(patients_multi_visits)]
    .reset_index(drop=True)
)
ADMISSIONS.to_csv(f'{multi_visit_mimic_dir}/ADMISSIONS.csv.gz', index=False, compression='gzip')


In [19]:
# Diagnoses: read ICD9_CODE as text, keep the multi-visit patients, save the subset.
DIAGNOSES_ICD = (
    pd.read_csv(f'{mimic_dir}/DIAGNOSES_ICD.csv.gz', dtype={'ICD9_CODE': str})
    .loc[lambda df: df.SUBJECT_ID.isin(patients_multi_visits)]
    .reset_index(drop=True)
)
DIAGNOSES_ICD.to_csv(f'{multi_visit_mimic_dir}/DIAGNOSES_ICD.csv.gz', index=False, compression='gzip')

# Procedures: same treatment as the diagnoses table.
PROCEDURES_ICD = (
    pd.read_csv(f'{mimic_dir}/PROCEDURES_ICD.csv.gz', dtype={'ICD9_CODE': str})
    .loc[lambda df: df.SUBJECT_ID.isin(patients_multi_visits)]
    .reset_index(drop=True)
)
PROCEDURES_ICD.to_csv(f'{multi_visit_mimic_dir}/PROCEDURES_ICD.csv.gz', index=False, compression='gzip')



In [20]:
# Load LABEVENTS, keep only the multi-visit patients, and save the subset.
LABEVENTS = (
    pd.read_csv(f'{mimic_dir}/LABEVENTS.csv.gz')
    .loc[lambda df: df.SUBJECT_ID.isin(patients_multi_visits)]
    .reset_index(drop=True)
)
LABEVENTS.to_csv(f'{multi_visit_mimic_dir}/LABEVENTS.csv.gz', index=False, compression='gzip')

In [21]:
# Read CHARTEVENTS in chunks of `chunksize` rows and keep, from each chunk,
# only the rows that belong to multi-visit patients.
chunksize = 10 ** 7
filtered_dfs = []
# low_memory=False makes pandas infer every column's dtype from the whole chunk
# instead of piecewise. Piecewise inference can leave a column with mixed types
# (presumably the source of the warnings this cell used to print), and values
# parsed as numbers in one piece and as text in another are written back
# inconsistently.
with pd.read_csv(f'{mimic_dir}/CHARTEVENTS.csv.gz', chunksize=chunksize, low_memory=False) as reader:
    for chunk in tqdm(reader):
        filtered_df = chunk[chunk.SUBJECT_ID.isin(patients_multi_visits)].reset_index(drop=True)
        filtered_dfs.append(filtered_df)


                        

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
34it [04:47,  8.46s/it]


In [22]:
# Write the filtered chunks into a single gzip-compressed CSV: the first chunk
# creates the file and emits the header row, each later chunk is appended
# without a header.
for i, df_chunk in enumerate(tqdm(filtered_dfs)):
    header = (i == 0)
    mode = 'w' if header else 'a'
    df_chunk.to_csv(
        f'{multi_visit_mimic_dir}/CHARTEVENTS.csv.gz',
        mode=mode,
        header=header,
        index=False,
        compression='gzip')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 34/34 [14:15<00:00, 25.16s/it]
