In [2]:
import numpy as np
import pandas as pd
import json
import collections
from collections import defaultdict 
from functools import partial
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm


### This notebook transforms information in tables into points with "day" as smallest time unit


In [3]:
# Good read: https://iq-inc.com/importerror-attempted-relative-import/

import sys
import importlib
from mimicnet import concept

importlib.reload(sys.modules['mimicnet.concept'])

<module 'mimicnet.concept' from '/home/asem/GP/MIMIC-SNONET/mimicnet/concept.py'>

In [18]:
# multi_visit_mimic_dir = '/home/am8520/GP/ehr-data/mimic3-multi-visit'
multi_visit_mimic_dir = '/home/asem/GP/ehr-data/mimic3-multi-visit'

PATIENTS = pd.read_csv(f'{multi_visit_mimic_dir}/PATIENTS.csv.gz')
ADMISSIONS = pd.read_csv(f'{multi_visit_mimic_dir}/ADMISSIONS.csv.gz')
DIAGNOSES_ICD = pd.read_csv(f'{multi_visit_mimic_dir}/DIAGNOSES_ICD.csv.gz')
PROCEDURES_ICD = pd.read_csv(f'{multi_visit_mimic_dir}/PROCEDURES_ICD.csv.gz')
LABEVENTS = pd.read_csv(f'{multi_visit_mimic_dir}/LABEVENTS.csv.gz')
LABEVENTS = LABEVENTS[['SUBJECT_ID', 'ITEMID', 'CHARTTIME', 'VALUE', 'VALUENUM', 'VALUEUOM']]

In [5]:
N_PATIENTS = PATIENTS.shape[0]
N_PATIENTS 

7537

In [6]:
chunksize = 10 ** 7
CHARTEVENTS_dfs = []
with pd.read_csv(f'{multi_visit_mimic_dir}/CHARTEVENTS.csv.gz', chunksize=chunksize) as reader:
    for chunk in tqdm(reader):
        CHARTEVENTS_dfs.append(chunk[['SUBJECT_ID', 'ITEMID', 'CHARTTIME', 'VALUE', 'VALUENUM', 'VALUEUOM']])

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
12it [02:14, 11.22s/it]


### Load dictionary stuff

In [7]:
# mimic_dir = '/home/am8520/GP/ehr-data/mimic3-multi-visit'
mimic_dir = '/home/asem/GP/ehr-data/mimic3-v1.4/physionet.org/files/mimiciii/1.4'


D_LABITEMS = pd.read_csv(f'{mimic_dir}/D_LABITEMS.csv.gz')
D_ITEMS = pd.read_csv(f'{mimic_dir}/D_ITEMS.csv.gz')

In [8]:
D_ITEMS.head()

Unnamed: 0,ROW_ID,ITEMID,LABEL,ABBREVIATION,DBSOURCE,LINKSTO,CATEGORY,UNITNAME,PARAM_TYPE,CONCEPTID
0,457,497,Patient controlled analgesia (PCA) [Inject],,carevue,chartevents,,,,
1,458,498,PCA Lockout (Min),,carevue,chartevents,,,,
2,459,499,PCA Medication,,carevue,chartevents,,,,
3,460,500,PCA Total Dose,,carevue,chartevents,,,,
4,461,501,PCV Exh Vt (Obser),,carevue,chartevents,,,,


In [100]:
D_LABITEMS.head()

Unnamed: 0,ROW_ID,ITEMID,LABEL,FLUID,CATEGORY,LOINC_CODE
0,546,51346,Blasts,Cerebrospinal Fluid (CSF),Hematology,26447-3
1,547,51347,Eosinophils,Cerebrospinal Fluid (CSF),Hematology,26451-5
2,548,51348,"Hematocrit, CSF",Cerebrospinal Fluid (CSF),Hematology,30398-2
3,549,51349,Hypersegmented Neutrophils,Cerebrospinal Fluid (CSF),Hematology,26506-6
4,550,51350,Immunophenotyping,Cerebrospinal Fluid (CSF),Hematology,


In [9]:
CHARTEVENTS_dfs[0].head()

Unnamed: 0,SUBJECT_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM
0,36,223834,2134-05-12 12:00:00,15.0,15.0,L/min
1,36,223835,2134-05-12 12:00:00,100.0,100.0,
2,36,224328,2134-05-12 12:00:00,0.37,0.37,
3,36,224329,2134-05-12 12:00:00,6.0,6.0,min
4,36,224330,2134-05-12 12:00:00,2.5,2.5,


In [19]:
LABEVENTS.head()

Unnamed: 0,SUBJECT_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM
0,17,50960,2134-12-29 03:18:00,2.1,2.1,mg/dL
1,17,50970,2134-12-29 03:18:00,2.0,2.0,mg/dL
2,17,50971,2134-12-29 03:18:00,4.7,4.7,mEq/L
3,17,50983,2134-12-29 03:18:00,136.0,136.0,mEq/L
4,17,51006,2134-12-29 03:18:00,11.0,11.0,mg/dL


In [10]:
CHARTEVENTS_dfs[0].ITEMID.value_counts()


220045    782629
220210    774740
220277    754336
220181    389933
220179    389017
           ...  
220125         1
228181         1
227647         1
227039         1
228146         1
Name: ITEMID, Length: 470, dtype: int64

### Select CHARTEVENTS with ITEMID covering at least 5% of all patients in the dataset

In [87]:
# For each chartevent item_id, store a set of patient that have at least one measurement of that type.
chartevents_item_patients = defaultdict(set)
for df in CHARTEVENTS_dfs:
    item_subject_df = df.drop_duplicates(subset=['ITEMID', 'SUBJECT_ID'], ignore_index=True)
    for item_id, subjects_df in item_subject_df.groupby('ITEMID'):
        chartevents_item_patients[item_id].update(subjects_df.SUBJECT_ID.tolist())
        
        

#### CONCLUSION: No duplicate info between LABEVENTS and CHARTEVENTS

In [88]:
print(len(chartevets_item_patients))
print(len(set(LABEVENTS.ITEMID)))
print(len(set(chartevets_item_patients.keys()) & set(LABEVENTS.ITEMID)))

4477
697
0


In [89]:
chartevents_item_patients_count_df = pd.DataFrame({'ITEMID': chartevents_item_patients.keys(),
                                                 'N_PATIENTS': map(len, chartevents_item_patients.values())})
chartevents_item_patients_count_df['P_PATIENTS'] = chartevents_item_patients_count_df['N_PATIENTS'] / N_PATIENTS
chartevents_item_patients_count_df = chartevents_item_patients_count_df.sort_values(by='N_PATIENTS', ascending=False)
chartevents_item_patients_count_df

Unnamed: 0,ITEMID,N_PATIENTS,P_PATIENTS
479,926,5119,0.679183
229,211,5032,0.667640
158,31,5018,0.665782
173,80,4953,0.657158
473,916,4891,0.648932
...,...,...,...
57,223958,381,0.050551
991,225323,380,0.050418
840,224327,377,0.050020
90,224952,377,0.050020


In [90]:
np.mean(chartevets_item_patients_count_df.P_PATIENTS > 0.05)

0.25642171096716554

In [91]:
selected_chartevents_itemid_set = set(chartevents_item_patients_count_df[chartevents_item_patients_count_df.P_PATIENTS > 0.05].ITEMID)

In [92]:
len(selected_chart_events_itemid_set)

1148

In [93]:
for i, df in enumerate(CHARTEVENTS_dfs):
    CHARTEVENTS_dfs[i] = df[df.ITEMID.isin(selected_chartevents_itemid_set)].reset_index(drop=True)

In [94]:
for i, df_chunk in enumerate(tqdm(CHARTEVENTS_dfs)):
    # Set writing mode to append after first chunk
    mode = 'w' if i == 0 else 'a'
    
    # Add header if it is the first chunk
    header = i == 0

    df_chunk.to_csv(
        f'{multi_visit_mimic_dir}/CHARTEVENTS_Q5.csv.gz', 
        compression='gzip', 
        index=False,
        header=header, 
        mode=mode)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [09:25<00:00, 47.12s/it]


### Select LABEVENTS with ITEMID covering at least 5% of all patients in the dataset

In [95]:
# For each labevents item_id, store a set of patient that have at least one measurement of that type.

labevents_item_patients = defaultdict(set)

labitem_subject_df = LABEVENTS.drop_duplicates(subset=['ITEMID', 'SUBJECT_ID'], ignore_index=True)
for item_id, subjects_df in labitem_subject_df.groupby('ITEMID'):
    labevents_item_patients[item_id].update(subjects_df.SUBJECT_ID.tolist())
    
labitem_patients_count_df = pd.DataFrame({'ITEMID': labevents_item_patients.keys(),
                                                 'N_PATIENTS': map(len, labevents_item_patients.values())})
labitem_patients_count_df['P_PATIENTS'] = labitem_patients_count_df['N_PATIENTS'] / N_PATIENTS

labitem_patients_count_df = labitem_patients_count_df.sort_values(by='N_PATIENTS', ascending=False)
labitem_patients_count_df

Unnamed: 0,ITEMID,N_PATIENTS,P_PATIENTS
398,51221,7517,0.997346
478,51301,7505,0.995754
442,51265,7504,0.995622
454,51277,7503,0.995489
427,51250,7503,0.995489
...,...,...,...
569,51396,1,0.000133
582,51409,1,0.000133
587,51414,1,0.000133
610,51437,1,0.000133


In [96]:
np.mean(labitem_patients_count_df.P_PATIENTS > 0.05)

0.3500717360114778

In [97]:
selected_labevents_itemid_set = set(labitem_patients_count_df[labitem_patients_count_df.P_PATIENTS > 0.05].ITEMID)

In [98]:
LABEVENTS_Q5 = LABEVENTS[LABEVENTS.ITEMID.isin(selected_labevents_itemid_set)].reset_index(drop=True)
LABEVENTS_Q5.to_csv(f'{multi_visit_mimic_dir}/LABEVENTS_Q5.csv.gz', 
                    compression='gzip', 
                    index=False,)

### Investigate the units used for each test type in CHARTEVENTS

In [72]:
# Group each ITEMID with the set of used units (to detect unit incosistency for each unique test).
chartevents_item_units = defaultdict(set)
for df in CHARTEVENTS_dfs:
    item_unit_df = df.drop_duplicates(subset=['ITEMID', 'VALUEUOM'], ignore_index=True)
    for item_id, units_df in item_unit_df.groupby('ITEMID'):
        units_df[units_df.VALUEUOM.isnull()] = ''
        chartevents_item_units[item_id].update(units_df.VALUEUOM)

In [73]:
chartevents_item_units_count_df = pd.DataFrame({'ITEMID': chartevents_item_units.keys(),
                                                 'N_UNITS': map(len, chartevents_item_units.values())})
chartevents_item_units_count_df = chartevents_item_units_count_df.sort_values(by='N_UNITS', ascending=False)
chartevents_item_units_count_df

Unnamed: 0,ITEMID,N_UNITS
335,578,3
190,113,3
324,543,2
191,114,2
167,69,2
...,...,...
459,837,1
461,848,1
462,849,1
463,850,1


In [76]:
itemid_label = dict(zip(D_ITEMS.ITEMID, D_ITEMS.LABEL))
itemid_category = dict(zip(D_ITEMS.ITEMID, D_ITEMS.LABEL))

chartevents_item_units_count_df

Unnamed: 0,ITEMID,N_UNITS,LABEL,CATEGORY
335,578,3,Pressure Support,Pressure Support
190,113,3,CVP,CVP
324,543,2,Plateau Pressure,Plateau Pressure
191,114,2,CaO2,CaO2
167,69,2,BSA,BSA
...,...,...,...,...
459,837,1,Sodium (135-148),Sodium (135-148)
461,848,1,Total Bili (0-1.5),Total Bili (0-1.5)
462,849,1,Total Protein(6.5-8),Total Protein(6.5-8)
463,850,1,Triglyceride (0-200),Triglyceride (0-200)


In [79]:
chartevents_item_tuples = []
for itemid in chartevents_item_units_count_df.ITEMID:
    for unit in chartevents_item_units[itemid]:
        chartevents_item_tuples.append((itemid, unit))
chartevents_units_df = pd.DataFrame(chartevents_item_tuples, columns=['ITEMID', 'VALUEUOM'])
chartevents_units_df['LABEL'] = chartevents_units_df.ITEMID.map(itemid_label)
chartevents_units_df['CATEGORY'] = chartevents_units_df.ITEMID.map(itemid_category)
chartevents_units_df.to_csv('chartevents_units_df.csv')
chartevents_units_df

Unnamed: 0,ITEMID,VALUEUOM,LABEL,CATEGORY
0,578,.,Pressure Support,Pressure Support
1,578,,Pressure Support,Pressure Support
2,578,cmH20,Pressure Support,Pressure Support
3,113,,CVP,CVP
4,113,mmHg,CVP,CVP
...,...,...,...,...
1293,837,,Sodium (135-148),Sodium (135-148)
1294,848,,Total Bili (0-1.5),Total Bili (0-1.5)
1295,849,,Total Protein(6.5-8),Total Protein(6.5-8)
1296,850,,Triglyceride (0-200),Triglyceride (0-200)


### CONCLUSION: Units are consistent for each measurement type in CHARTEVENTS

### Investigate the units used for each test type in LABEVENTS

In [99]:
# Group each ITEMID with the set of used units (to detect unit incosistency for each unique test).
labevents_item_units = defaultdict(set)
lab_unit_df = LABEVENTS_Q5.drop_duplicates(subset=['ITEMID', 'VALUEUOM'], ignore_index=True)
for item_id, units_df in lab_unit_df.groupby('ITEMID'):
    units_df[units_df.VALUEUOM.isnull()] = ''
    labevents_item_units[item_id].update(units_df.VALUEUOM)

In [102]:
labevents_item_units_count_df = pd.DataFrame({'ITEMID': labevents_item_units.keys(),
                                                 'N_UNITS': map(len, labevents_item_units.values())})
labevents_item_units_count_df = labevents_item_units_count_df.sort_values(by='N_UNITS', ascending=False)
labevents_item_units_count_df

Unnamed: 0,ITEMID,N_UNITS
51,50889,4
94,50964,3
106,50994,2
219,51464,2
62,50909,2
...,...,...
90,50955,1
91,50956,1
92,50960,1
95,50965,1


In [106]:
labitem_label = dict(zip(D_LABITEMS.ITEMID, D_LABITEMS.LABEL))
labitem_category = dict(zip(D_LABITEMS.ITEMID, D_LABITEMS.CATEGORY))
labitem_nunits = dict(zip(labevents_item_units_count_df.ITEMID, labevents_item_units_count_df.N_UNITS))

In [108]:
labevents_item_tuples = []
for itemid in labevents_item_units_count_df.ITEMID:
    for unit in labevents_item_units[itemid]:
        labevents_item_tuples.append((itemid, unit))
labevents_units_df = pd.DataFrame(labevents_item_tuples, columns=['ITEMID', 'VALUEUOM'])
labevents_units_df['LABEL'] = labevents_units_df.ITEMID.map(labitem_label)
labevents_units_df['CATEGORY'] = labevents_units_df.ITEMID.map(itemid_category)
labevents_units_df['N_UNITS'] = labevents_units_df.ITEMID.map(labitem_nunits)

labevents_units_df = labevents_units_df[labevents_units_df.N_UNITS > 1]
labevents_units_df.to_csv('labevents_units_df.csv')
labevents_units_df

Unnamed: 0,ITEMID,VALUEUOM,LABEL,CATEGORY,N_UNITS
0,50889,mg/L,C-Reactive Protein,,4
1,50889,mg/dL,C-Reactive Protein,,4
2,50889,MG/DL,C-Reactive Protein,,4
3,50889,,C-Reactive Protein,,4
4,50964,mOsm/kg,"Osmolality, Measured",,3
...,...,...,...,...,...
60,50924,ng/mL,Ferritin,,2
61,50937,,Hepatitis A Virus Antibody,,2
62,50937,Pos/Neg,Hepatitis A Virus Antibody,,2
63,50818,mm Hg,pCO2,,2
