In [1]:
import collections
import json
import os
from collections import defaultdict
from functools import partial

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm


In [2]:
# Root of the raw MIMIC-IV v1.0 download. Set the MIMIC4_DIR environment
# variable to run the notebook on another machine; the original path is kept
# as the default so existing setups keep working unchanged.
mimic_dir = os.environ.get(
    'MIMIC4_DIR',
    '/home/asem/GP/ehr-data/mimic4-v1.0/physionet.org/files/mimiciv/1.0',
)

In [3]:
# Good read: https://iq-inc.com/importerror-attempted-relative-import/

import sys
import importlib

### Notes

#### TODO

1. From DIAGNOSES_ICD:
    - map to CCS (multi/single)
    - histogram of diseases (number of patients for each CCS code)
2. From ADMISSIONS and ICUSTAYS:
    - #Admissions per patient
    - #ICUStays per patient
    - #ICUStays per admission
3. measurements: merge all measurements by HADM_ID (aggregate by median, 0.9 quantile, 0.1 quantile).

In [4]:
# Load the admissions table from the `core` folder of the raw download.
admissions_csv = f'{mimic_dir}/core/admissions.csv.gz'
admissions = pd.read_csv(admissions_csv)

In [5]:
# Preview the first five admissions rows.
admissions.head(n=5)

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,marital_status,ethnicity,edregtime,edouttime,hospital_expire_flag
0,14679932,21038362,2139-09-26 14:16:00,2139-09-28 11:30:00,,ELECTIVE,,HOME,Other,ENGLISH,SINGLE,UNKNOWN,,,0
1,15585972,24941086,2123-10-07 23:56:00,2123-10-12 11:22:00,,ELECTIVE,,HOME,Other,ENGLISH,,WHITE,,,0
2,11989120,21965160,2147-01-14 09:00:00,2147-01-17 14:25:00,,ELECTIVE,,HOME,Other,ENGLISH,,UNKNOWN,,,0
3,17817079,24709883,2165-12-27 17:33:00,2165-12-31 21:18:00,,ELECTIVE,,HOME,Other,ENGLISH,,OTHER,,,0
4,15078341,23272159,2122-08-28 08:48:00,2122-08-30 12:32:00,,ELECTIVE,,HOME,Other,ENGLISH,,BLACK/AFRICAN AMERICAN,,,0


In [6]:
# Number of distinct patients that appear in the admissions table.
admissions['subject_id'].nunique()

256878

In [7]:
# Collect, for every patient, the set of distinct admission IDs recorded for them.
patients_admission = defaultdict(set)
for subject_id, hadm_id in zip(admissions['subject_id'], admissions['hadm_id']):
    patients_admission[subject_id].add(hadm_id)

# One row per patient together with the number of distinct admissions.
patients_admission_df = pd.DataFrame({
    'patient': patients_admission.keys(),
    'n_admissions': [len(hadm_ids) for hadm_ids in patients_admission.values()],
})

In [8]:
# (patient id, number of distinct admissions) pairs, in dictionary order.
patients_admission_count = [
    (patient_id, len(hadm_ids))
    for patient_id, hadm_ids in patients_admission.items()
]

In [9]:
# Keep only the per-patient admission counts.
# Built directly rather than via `_, counts = zip(*...)` so that:
#   - IPython's `_` (last-output) variable is not rebound by this cell,
#   - an empty input yields an empty tuple instead of a ValueError on unpacking,
#   - no throwaway tuple of every patient id is materialised.
counts = tuple(n_adm for _patient, n_adm in patients_admission_count)

In [10]:
# admissions_hist[k] is the number of patients with exactly k admissions.
admissions_hist = np.bincount(counts, minlength=0)

In [11]:
# Display the histogram: index k holds the number of patients with exactly k admissions.
admissions_hist

array([     0, 171080,  39248,  17035,   9265,   5515,   3547,   2424,
         1783,   1367,    990,    801,    626,    497,    420,    324,
          251,    200,    177,    148,    149,    101,    108,     75,
           70,     56,     52,     57,     40,     32,     40,     29,
           30,     29,     22,     29,     22,     19,     11,     12,
           14,     16,     10,      6,     13,      6,      5,      9,
            7,     11,      6,      5,      5,      7,      2,      6,
            4,      3,      5,      2,      2,      1,      2,      1,
            6,      1,      0,      2,      3,      3,      1,      3,
            1,      0,      0,      1,      1,      1,      1,      2,
            1,      0,      1,      0,      2,      1,      1,      0,
            1,      2,      0,      0,      1,      1,      2,      1,
            0,      1,      1,      1,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
      

In [12]:
# The GRAM paper used every patient with at least 2 visits (n=7537 in that paper).
# Count how many patients satisfy the same criterion in this dataset.
(patients_admission_df['n_admissions'] > 1).sum()

85798

### Select patients with at least 2 visits

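- Follows the cohort selection of the GRAM paper, which used all patients with at least 2 visits (n=7537 in that paper).
- Applying the same criterion to this MIMIC-IV data selects n=85798 patients (see the count above).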

In [13]:
# Output directory for the multi-visit subset. Set the MIMIC4_MULTI_VISIT_DIR
# environment variable to redirect it; the original path remains the default.
multi_visit_mimic_dir = os.environ.get(
    'MIMIC4_MULTI_VISIT_DIR',
    '/home/asem/GP/ehr-data/mimic4-multi-visit',
)
# Later cells write CSV files into this directory and pandas does not create
# missing parent folders, so make sure it exists before anything is written.
os.makedirs(multi_visit_mimic_dir, exist_ok=True)

# IDs of the patients with more than one admission.
has_multiple_visits = patients_admission_df.n_admissions > 1
patients_multi_visits = set(patients_admission_df.loc[has_multiple_visits, 'patient'].tolist())

In [15]:
# Restrict the patients table to the multi-visit cohort, persist it, and preview it.
patients = (
    pd.read_csv(f'{mimic_dir}/core/patients.csv.gz')
    .loc[lambda frame: frame.subject_id.isin(patients_multi_visits)]
    .reset_index(drop=True)
)
patients.to_csv(
    f'{multi_visit_mimic_dir}/patients.csv.gz',
    compression='gzip',
    index=False,
)
patients.head(n=5)

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10002723,F,0,2128,2017 - 2019,
1,10018928,F,31,2125,2008 - 2010,
2,10074117,F,55,2118,2008 - 2010,
3,10124191,F,27,2125,2011 - 2013,
4,10148710,M,67,2137,2008 - 2010,


In [16]:
# Read patients.csv.gz back from the multi-visit directory and preview it.
patients_subset_csv = f'{multi_visit_mimic_dir}/patients.csv.gz'
patients = pd.read_csv(patients_subset_csv)
patients.head(n=5)

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10002723,F,0,2128,2017 - 2019,
1,10018928,F,31,2125,2008 - 2010,
2,10074117,F,55,2118,2008 - 2010,
3,10124191,F,27,2125,2011 - 2013,
4,10148710,M,67,2137,2008 - 2010,


In [17]:
# Keep only admissions that belong to multi-visit patients, then persist them.
in_cohort = admissions['subject_id'].isin(patients_multi_visits)
admissions = admissions.loc[in_cohort].reset_index(drop=True)
admissions.to_csv(
    f'{multi_visit_mimic_dir}/admissions.csv.gz',
    compression='gzip',
    index=False,
)


In [20]:
def _subset_icd_table(file_name):
    """Filter one ICD table from the `hosp` folder to the multi-visit cohort.

    Reads `{mimic_dir}/hosp/{file_name}` with `icd_code` forced to `str`,
    keeps the rows whose `subject_id` is in `patients_multi_visits`, writes
    the result gzip-compressed to `{multi_visit_mimic_dir}/{file_name}`
    (without the index), and returns the filtered frame.
    """
    table = pd.read_csv(f'{mimic_dir}/hosp/{file_name}', dtype={'icd_code': str})
    in_cohort = table.subject_id.isin(patients_multi_visits)
    table = table[in_cohort].reset_index(drop=True)
    table.to_csv(f'{multi_visit_mimic_dir}/{file_name}', compression='gzip', index=False)
    return table


# Same read -> filter -> write steps for both ICD tables, diagnoses first.
diagnoses_icd = _subset_icd_table('diagnoses_icd.csv.gz')
procedures_icd = _subset_icd_table('procedures_icd.csv.gz')

