In [9]:
import numpy as np
import pandas as pd
import json
import collections
from collections import defaultdict 
from functools import partial
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm


### This notebook transforms the information in the tables into points, with "day" as the smallest time unit


In [10]:
# Good read: https://iq-inc.com/importerror-attempted-relative-import/

import sys
import importlib
from mimicnet import concept

# Re-execute the already-imported `mimicnet.concept` module so that edits made to its
# source are picked up without restarting the kernel. The reloaded module object is
# the cell's last expression, so its repr is displayed.
importlib.reload(sys.modules['mimicnet.concept'])

<module 'mimicnet.concept' from '/home/asem/GP/MIMIC-SNONET/mimicnet/concept.py'>

In [11]:
# Directory holding the gzipped CSV tables read below.
# NOTE(review): hard-coded, machine-specific absolute path -- consider a configurable data dir.
# multi_visit_mimic_dir = '/home/am8520/GP/ehr-data/mimic3-multi-visit'
multi_visit_mimic_dir = '/home/asem/GP/ehr-data/mimic3-multi-visit'

# Each table is read fully into memory as its own DataFrame.
PATIENTS = pd.read_csv(f'{multi_visit_mimic_dir}/PATIENTS.csv.gz')
ADMISSIONS = pd.read_csv(f'{multi_visit_mimic_dir}/ADMISSIONS.csv.gz')
DIAGNOSES_ICD = pd.read_csv(f'{multi_visit_mimic_dir}/DIAGNOSES_ICD.csv.gz')
PROCEDURES_ICD = pd.read_csv(f'{multi_visit_mimic_dir}/PROCEDURES_ICD.csv.gz')
LABEVENTS = pd.read_csv(f'{multi_visit_mimic_dir}/LABEVENTS.csv.gz')

In [33]:
# Number of rows in PATIENTS (presumably one row per patient -- verify against the table).
N_PATIENTS = len(PATIENTS)
N_PATIENTS

7537

In [15]:
# Stream CHARTEVENTS in chunks of `chunksize` rows and keep one DataFrame per chunk.
chunksize = 10 ** 7
# Only these columns are retained; passing them as `usecols` lets the parser skip
# every other column instead of materialising it for each chunk and discarding it.
chartevents_columns = ['SUBJECT_ID', 'ITEMID', 'CHARTTIME', 'VALUE', 'VALUENUM', 'VALUEUOM']
CHARTEVENTS_dfs = []
with pd.read_csv(f'{multi_visit_mimic_dir}/CHARTEVENTS.csv.gz',
                 usecols=chartevents_columns,
                 chunksize=chunksize) as reader:
    for chunk in tqdm(reader):
        # `usecols` keeps the file's own column order, so re-select to pin the order above.
        CHARTEVENTS_dfs.append(chunk[chartevents_columns])

12it [02:24, 12.06s/it]


### Load the dictionary tables (D_LABITEMS, D_ITEMS)

In [16]:
# Directory holding the dictionary (D_*) tables read below.
# NOTE(review): hard-coded, machine-specific absolute path -- consider a configurable data dir.
# mimic_dir = '/home/am8520/GP/ehr-data/mimic3-multi-visit'
mimic_dir = '/home/asem/GP/ehr-data/mimic3-v1.4/physionet.org/files/mimiciii/1.4'


# Dictionary tables; D_ITEMS carries ITEMID and LABEL columns (see its preview below).
D_LABITEMS = pd.read_csv(f'{mimic_dir}/D_LABITEMS.csv.gz')
D_ITEMS = pd.read_csv(f'{mimic_dir}/D_ITEMS.csv.gz')

In [17]:
# Preview the first rows of the item dictionary.
D_ITEMS.head()

Unnamed: 0,ROW_ID,ITEMID,LABEL,ABBREVIATION,DBSOURCE,LINKSTO,CATEGORY,UNITNAME,PARAM_TYPE,CONCEPTID
0,457,497,Patient controlled analgesia (PCA) [Inject],,carevue,chartevents,,,,
1,458,498,PCA Lockout (Min),,carevue,chartevents,,,,
2,459,499,PCA Medication,,carevue,chartevents,,,,
3,460,500,PCA Total Dose,,carevue,chartevents,,,,
4,461,501,PCV Exh Vt (Obser),,carevue,chartevents,,,,


In [25]:
# Preview the first rows of the first loaded CHARTEVENTS chunk.
CHARTEVENTS_dfs[0].head()

Unnamed: 0,SUBJECT_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM
0,36,223834,2134-05-12 12:00:00,15.0,15.0,L/min
1,36,223835,2134-05-12 12:00:00,100.0,100.0,
2,36,224328,2134-05-12 12:00:00,0.37,0.37,
3,36,224329,2134-05-12 12:00:00,6.0,6.0,min
4,36,224330,2134-05-12 12:00:00,2.5,2.5,


In [24]:
# Row count per ITEMID within the first chunk only, most frequent first.
CHARTEVENTS_dfs[0].ITEMID.value_counts()


220045    782629
220210    774740
220277    754336
220181    389933
220179    389017
           ...  
220125         1
228181         1
227647         1
227039         1
228146         1
Name: ITEMID, Length: 470, dtype: int64

In [None]:
# Map every chart-event ITEMID to the set of SUBJECT_IDs that have at least one
# measurement of that type, accumulating across all loaded chunks.
chartevets_item_patients = defaultdict(set)
for chunk_df in CHARTEVENTS_dfs:
    # Collapse the chunk to one row per distinct (ITEMID, SUBJECT_ID) pair.
    unique_pairs = chunk_df.drop_duplicates(subset=['ITEMID', 'SUBJECT_ID'], ignore_index=True)
    for item_id, subject_ids in unique_pairs.groupby('ITEMID')['SUBJECT_ID']:
        chartevets_item_patients[item_id].update(subject_ids.tolist())
        
        

In [28]:
# Number of distinct ITEMIDs collected across all chunks.
len(chartevets_item_patients)

4477

In [34]:
# One row per ITEMID: how many distinct patients have it (N_PATIENTS) and which
# fraction of N_PATIENTS that is (P_PATIENTS), ordered from most to least covered.
chartevets_item_patients_count_df = (
    pd.DataFrame({
        'ITEMID': list(chartevets_item_patients.keys()),
        'N_PATIENTS': [len(subject_ids) for subject_ids in chartevets_item_patients.values()],
    })
    .assign(P_PATIENTS=lambda frame: frame['N_PATIENTS'] / N_PATIENTS)
    .sort_values(by='N_PATIENTS', ascending=False)
)
chartevets_item_patients_count_df

Unnamed: 0,ITEMID,N_PATIENTS,P_PATIENTS
1035,926,5119,0.679183
617,211,5032,0.667640
479,31,5018,0.665782
516,80,4953,0.657158
1036,927,4891,0.648932
...,...,...,...
2583,2339,1,0.000133
2584,2368,1,0.000133
2585,2369,1,0.000133
2586,2408,1,0.000133


In [36]:
# Share of ITEMIDs whose patient coverage exceeds 5%.
(chartevets_item_patients_count_df['P_PATIENTS'] > 0.05).mean()

0.25642171096716554

### Select CHARTEVENTS with ITEMID covering at least 5% of all patients in the dataset

In [39]:
# ITEMIDs whose patient coverage exceeds 5%.
selected_chart_events_itemid_set = set(
    chartevets_item_patients_count_df.loc[
        chartevets_item_patients_count_df['P_PATIENTS'] > 0.05, 'ITEMID'
    ]
)

In [40]:
# Number of ITEMIDs that passed the coverage filter.
len(selected_chart_events_itemid_set)

1148

### Investigate the units used for each test type

In [None]:
# Group each ITEMID with the set of units recorded for it (to detect unit
# inconsistency for each unique test), accumulating across all loaded chunks.
chartevets_item_units = defaultdict(set)
for df in CHARTEVENTS_dfs:
    # One row per distinct (ITEMID, VALUEUOM) pair within this chunk.
    item_unit_df = df.drop_duplicates(subset=['ITEMID', 'VALUEUOM'], ignore_index=True)
    for item_id, units_df in item_unit_df.groupby('ITEMID'):
        # A missing VALUEUOM is not a unit: counting NaN would flag an item as
        # inconsistent merely because some rows omit the unit, and NaN != NaN makes
        # its de-duplication inside a set depend on object identity.
        # The defaultdict lookup still registers item_id, so an item whose unit is
        # always missing ends up with an empty set rather than being dropped.
        chartevets_item_units[item_id].update(units_df.VALUEUOM.dropna().tolist())

In [None]:
# One row per ITEMID with the number of distinct values collected for it in
# `chartevets_item_units` (N_UNITS), ordered so that items with the most units --
# the candidates for unit inconsistency -- come first.
chartevets_item_units_count_df = pd.DataFrame({
    'ITEMID': list(chartevets_item_units.keys()),
    'N_UNITS': [len(units) for units in chartevets_item_units.values()],
})
# The original cell re-sorted and displayed the patient-count frame here (a copy-paste
# slip), so the unit counts built above were never ordered or shown.
chartevets_item_units_count_df = chartevets_item_units_count_df.sort_values(by='N_UNITS', ascending=False)
chartevets_item_units_count_df