In [16]:
import json
import os
from collections import defaultdict
from functools import partial
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

In [3]:
# Root directory of the raw MIMIC-III 1.4 CSV files.
# Set the MIMIC_DIR environment variable to point at another copy of the data;
# when it is unset, the original local path below is used, so existing setups
# keep working unchanged.
# Previously used location: '/home/asem/GP/MIMIC-SNONET/RAW/mimic-iii-clinical-database-1.4'
mimic_dir = os.environ.get(
    'MIMIC_DIR',
    '/home/asem/GP/ehr-data/physionet.org/files/mimiciii/1.4',
)

In [4]:
# Good read: https://iq-inc.com/importerror-attempted-relative-import/

import sys
import importlib


### Notes

#### TODO

1. From DIAGNOSES_ICD:
    - map to CCS (multi/single)
    - histogram of diseases (number of patients for each CCS code)
2. From ADMISSIONS and ICUSTAYS:
    - #Admissions per patient
    - #ICUStays per patient
    - #ICUStays per admission
3. Measurements: merge all measurements by HADM_ID (aggregate by the median and by the 0.1 and 0.9 quantiles).

In [5]:
# Load the ADMISSIONS table from the raw MIMIC directory into a DataFrame.
admissions_csv = f'{mimic_dir}/ADMISSIONS.csv.gz'
ADMISSIONS = pd.read_csv(filepath_or_buffer=admissions_csv)

In [6]:
# Preview the first rows of the admissions table to check the columns that were loaded.
ADMISSIONS.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,,UNOBTAINABLE,MARRIED,WHITE,2196-04-09 10:06:00,2196-04-09 13:24:00,BENZODIAZEPINE OVERDOSE,0,1
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,,,BRAIN MASS,0,1
3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,,PROTESTANT QUAKER,SINGLE,WHITE,,,INTERIOR MYOCARDIAL INFARCTION,0,1
4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,UNOBTAINABLE,MARRIED,WHITE,2160-11-02 01:01:00,2160-11-02 04:27:00,ACUTE CORONARY SYNDROME,0,1


In [7]:
# Number of distinct SUBJECT_ID values (patients) in the admissions table.
ADMISSIONS['SUBJECT_ID'].nunique()

46520

In [8]:
# Map every patient (SUBJECT_ID) to the set of their admission ids (HADM_ID).
# Using a set means a repeated (patient, admission) pair is only counted once.
patients_admission = defaultdict(set)
for subject_id, hadm_id in zip(ADMISSIONS.SUBJECT_ID, ADMISSIONS.HADM_ID):
    patients_admission[subject_id].add(hadm_id)

# One row per patient with the number of distinct admissions recorded for them.
patients_admission_df = pd.DataFrame({
    'patient': list(patients_admission),
    'n_admissions': [len(hadm_ids) for hadm_ids in patients_admission.values()],
})

In [9]:
# List of (patient, number of distinct admissions) pairs.
patients_admission_count = [
    (patient, len(hadm_ids)) for patient, hadm_ids in patients_admission.items()
]

In [10]:
# Pull out the per-patient admission counts as a tuple.
# A generator replaces `_, counts = zip(*...)` for two reasons:
#   * zip-unpacking raises ValueError when the list is empty; this yields ().
#   * assigning to `_` at cell level overwrites IPython's "last output" variable.
counts = tuple(n_admissions for _patient, n_admissions in patients_admission_count)

In [11]:
# admissions_hist[k] is how many entries of `counts` equal k,
# i.e. how many patients have exactly k admissions.
admissions_hist = np.bincount(counts)

In [12]:
# Show the histogram; position k holds the number of patients with k admissions.
admissions_hist

array([    0, 38983,  5160,  1342,   508,   246,   113,    51,    31,
          26,    14,    13,     8,     5,     4,     1,     2,     3,
           0,     1,     1,     1,     1,     1,     1,     0,     0,
           0,     0,     0,     0,     1,     0,     0,     2,     0,
           0,     0,     0,     0,     0,     0,     1])

In [13]:
# The GRAM paper uses every patient with at least 2 visits (n=7537).
# Count the patients here that have more than one admission.
has_multiple_admissions = patients_admission_df['n_admissions'] > 1
has_multiple_admissions.sum()

7537

### Select patients with at least 2 visits

- This follows the cohort selection used in the GRAM paper.
- n=7537

In [17]:
# Output directory for the multi-visit cohort tables.
# Set the MIMIC_COHORT_DIR environment variable to write somewhere else;
# when it is unset, the original local path below is used.
multi_visit_mimic_dir = os.environ.get(
    'MIMIC_COHORT_DIR',
    '/home/asem/GP/ehr-data/mimic3-cohort',
)

# Create the output directory (and any missing parents); no error if it already exists.
Path(multi_visit_mimic_dir).mkdir(parents=True, exist_ok=True)

# Ids of the patients with more than one admission; used to filter every table below.
patients_multi_visits = set(patients_admission_df[patients_admission_df.n_admissions > 1].patient.tolist())

In [18]:
# Restrict PATIENTS to the multi-visit cohort, save the subset to the cohort
# directory, and preview the first rows.
patients_raw = pd.read_csv(f'{mimic_dir}/PATIENTS.csv.gz')
in_cohort = patients_raw['SUBJECT_ID'].isin(patients_multi_visits)
PATIENTS = patients_raw.loc[in_cohort].reset_index(drop=True)
PATIENTS.to_csv(
    f'{multi_visit_mimic_dir}/PATIENTS.csv.gz',
    index=False,
    compression='gzip',
)
PATIENTS.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG
0,234,249,F,2075-03-13 00:00:00,,,,0
1,237,252,M,2078-03-06 00:00:00,,,,0
2,240,256,M,2086-07-31 00:00:00,,,,0
3,628,665,M,2052-05-20 00:00:00,2120-02-04 00:00:00,2120-02-04 00:00:00,,1
4,637,674,F,2113-12-14 00:00:00,2195-02-17 00:00:00,2195-02-17 00:00:00,2195-02-17 00:00:00,1


In [19]:
# Reload PATIENTS from the cohort directory and preview the first rows.
cohort_patients_csv = f'{multi_visit_mimic_dir}/PATIENTS.csv.gz'
PATIENTS = pd.read_csv(cohort_patients_csv)
PATIENTS.head(n=5)

Unnamed: 0,ROW_ID,SUBJECT_ID,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG
0,234,249,F,2075-03-13 00:00:00,,,,0
1,237,252,M,2078-03-06 00:00:00,,,,0
2,240,256,M,2086-07-31 00:00:00,,,,0
3,628,665,M,2052-05-20 00:00:00,2120-02-04 00:00:00,2120-02-04 00:00:00,,1
4,637,674,F,2113-12-14 00:00:00,2195-02-17 00:00:00,2195-02-17 00:00:00,2195-02-17 00:00:00,1


In [20]:
# Keep only the admissions of multi-visit patients and save them to the cohort directory.
# Note: this rebinds ADMISSIONS to the filtered frame.
cohort_rows = ADMISSIONS['SUBJECT_ID'].isin(patients_multi_visits)
ADMISSIONS = ADMISSIONS.loc[cohort_rows].reset_index(drop=True)
ADMISSIONS.to_csv(
    f'{multi_visit_mimic_dir}/ADMISSIONS.csv.gz',
    index=False,
    compression='gzip',
)


In [21]:
def _export_multi_visit_table(table_name, **read_csv_kwargs):
    """Filter one raw MIMIC table to the multi-visit cohort and save the subset.

    Reads ``{mimic_dir}/{table_name}.csv.gz``, keeps the rows whose SUBJECT_ID is
    in ``patients_multi_visits``, resets the index, and writes the result to
    ``{multi_visit_mimic_dir}/{table_name}.csv.gz`` (gzip, without the index).

    Args:
        table_name: Base name of the table file, without the ``.csv.gz`` suffix.
        **read_csv_kwargs: Extra keyword arguments forwarded to ``pd.read_csv``
            (for example ``dtype``).

    Returns:
        The filtered DataFrame that was written to disk.
    """
    table = pd.read_csv(f'{mimic_dir}/{table_name}.csv.gz', **read_csv_kwargs)
    table = table[table.SUBJECT_ID.isin(patients_multi_visits)].reset_index(drop=True)
    table.to_csv(f'{multi_visit_mimic_dir}/{table_name}.csv.gz', compression='gzip', index=False)
    return table


# ICD9_CODE is read as a string so the codes are kept as text rather than parsed as numbers.
DIAGNOSES_ICD = _export_multi_visit_table('DIAGNOSES_ICD', dtype={'ICD9_CODE': str})
PROCEDURES_ICD = _export_multi_visit_table('PROCEDURES_ICD', dtype={'ICD9_CODE': str})



In [22]:
# Load LABEVENTS, keep only the rows of multi-visit patients, and save the subset.
# The chain avoids holding a second name bound to the unfiltered frame.
LABEVENTS = (
    pd.read_csv(f'{mimic_dir}/LABEVENTS.csv.gz')
    .loc[lambda events: events['SUBJECT_ID'].isin(patients_multi_visits)]
    .reset_index(drop=True)
)
LABEVENTS.to_csv(
    f'{multi_visit_mimic_dir}/LABEVENTS.csv.gz',
    index=False,
    compression='gzip',
)

In [23]:
# Read CHARTEVENTS in chunks of `chunksize` rows and keep, from each chunk, only the
# rows whose SUBJECT_ID is in the multi-visit cohort. The filtered pieces are
# accumulated in `filtered_dfs` (in memory), one DataFrame per chunk.
chunksize = 10_000_000
filtered_dfs = []
chartevents_csv = f'{mimic_dir}/CHARTEVENTS.csv.gz'
with pd.read_csv(chartevents_csv, chunksize=chunksize) as chunk_reader:
    for raw_chunk in tqdm(chunk_reader):
        in_cohort = raw_chunk['SUBJECT_ID'].isin(patients_multi_visits)
        filtered_dfs.append(raw_chunk.loc[in_cohort].reset_index(drop=True))


                        

  for obj in iterable:
15it [02:24,  9.62s/it]


KeyboardInterrupt: 

In [None]:
# Write the filtered CHARTEVENTS pieces into a single gzip-compressed CSV.
# The first piece creates (or overwrites) the file and carries the header row;
# every later piece is appended without a header.
chartevents_out = f'{multi_visit_mimic_dir}/CHARTEVENTS.csv.gz'
for chunk_idx, df_chunk in enumerate(tqdm(filtered_dfs)):
    is_first_chunk = chunk_idx == 0
    df_chunk.to_csv(
        chartevents_out,
        mode='w' if is_first_chunk else 'a',
        header=is_first_chunk,
        index=False,
        compression='gzip',
    )