In [6]:
import numpy as np
import pandas as pd
import json
import collections
from collections import defaultdict 
from functools import partial
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm


### This notebook transforms information in tables into points with "day" as smallest time unit


In [7]:
# Good read: https://iq-inc.com/importerror-attempted-relative-import/

import sys
import importlib
from mimicnet import concept

# Re-execute the already-imported `mimicnet.concept` module in this kernel
# (presumably so local edits to it are picked up without a restart - confirm).
importlib.reload(sys.modules['mimicnet.concept'])

In [8]:
# multi_visit_mimic_dir = '/home/am8520/GP/ehr-data/mimic3-multi-visit'
multi_visit_mimic_dir = '/home/asem/GP/ehr-data/mimic3-multi-visit'

# Patient and admission tables.
PATIENTS = pd.read_csv(
    f'{multi_visit_mimic_dir}/PATIENTS.csv.gz'
)
ADMISSIONS = pd.read_csv(
    f'{multi_visit_mimic_dir}/ADMISSIONS.csv.gz'
)

# ICD9_CODE is forced to str so the codes are kept as text rather than parsed as numbers.
DIAGNOSES_ICD = pd.read_csv(
    f'{multi_visit_mimic_dir}/DIAGNOSES_ICD.csv.gz',
    dtype={'ICD9_CODE': str},
)
PROCEDURES_ICD = pd.read_csv(
    f'{multi_visit_mimic_dir}/PROCEDURES_ICD.csv.gz',
    dtype={'ICD9_CODE': str},
)

# Lab measurements (loaded whole; CHARTEVENTS is read separately in chunks).
LABEVENTS = pd.read_csv(
    f'{multi_visit_mimic_dir}/LABEVENTS.csv.gz'
)

In [9]:
# Narrow LABEVENTS to the six measurement columns (the same ones kept for CHARTEVENTS).
LABEVENTS = LABEVENTS.loc[
    :, ['SUBJECT_ID', 'ITEMID', 'CHARTTIME', 'VALUE', 'VALUENUM', 'VALUEUOM']
]

In [10]:
# Number of rows in PATIENTS; used below as the denominator of the coverage ratio.
N_PATIENTS = len(PATIENTS)
N_PATIENTS

In [5]:
# CHARTEVENTS is too large to load in one go, so it is read in chunks of
# `chunksize` rows and kept as a list of DataFrames.
chunksize = 10 ** 7

# Only these columns are used; `usecols` skips the others at parse time
# instead of materialising them and dropping them afterwards.
chartevents_columns = ['SUBJECT_ID', 'ITEMID', 'CHARTTIME', 'VALUE', 'VALUENUM', 'VALUEUOM']

CHARTEVENTS_dfs = []
with pd.read_csv(f'{multi_visit_mimic_dir}/CHARTEVENTS.csv.gz',
                 usecols=chartevents_columns,
                 chunksize=chunksize) as reader:
    for chunk in tqdm(reader):
        # `usecols` keeps the file's own column order, so re-select explicitly
        # to guarantee the order listed above.
        CHARTEVENTS_dfs.append(chunk[chartevents_columns])

### Load the dictionary tables (D_ITEMS, D_LABITEMS)

In [11]:
# mimic_dir = '/home/am8520/GP/ehr-data/mimic3-multi-visit'
mimic_dir = '/home/asem/GP/ehr-data/mimic3-v1.4/physionet.org/files/mimiciii/1.4'


# Dictionary tables that describe the ITEMIDs used by the event tables.
D_LABITEMS = pd.read_csv(f'{mimic_dir}/D_LABITEMS.csv.gz')
D_ITEMS = pd.read_csv(f'{mimic_dir}/D_ITEMS.csv.gz')

# ITEMID -> LABEL / CATEGORY lookups for chart items.
itemid_label = dict(zip(D_ITEMS.ITEMID, D_ITEMS.LABEL))
# Fixed: this lookup previously zipped LABEL again, so every "category" was
# really the item's label.
itemid_category = dict(zip(D_ITEMS.ITEMID, D_ITEMS.CATEGORY))


# ITEMID -> LABEL / CATEGORY lookups for lab items.
labitem_label = dict(zip(D_LABITEMS.ITEMID, D_LABITEMS.LABEL))
labitem_category = dict(zip(D_LABITEMS.ITEMID, D_LABITEMS.CATEGORY))

In [12]:
# Preview the first rows of the chart-item dictionary.
D_ITEMS.head()

In [13]:
# Preview the first rows of the lab-item dictionary.
D_LABITEMS.head()

In [14]:
# Preview the first rows of the first CHARTEVENTS chunk.
CHARTEVENTS_dfs[0].head()

In [15]:
# Preview the first rows of LABEVENTS.
LABEVENTS.head()

In [16]:
# Row count per ITEMID in the first CHARTEVENTS chunk only (not the whole table).
CHARTEVENTS_dfs[0].ITEMID.value_counts()


## (A) Select CHARTEVENTS with ITEMID covering at least 5% of all patients in the dataset

### (A-1) Drop non-numerical measurements

In [14]:
# Split every CHARTEVENTS chunk into numeric rows (VALUENUM present), which
# replace the chunk in CHARTEVENTS_dfs, and non-numeric rows (VALUENUM missing
# but VALUE present), which are collected separately for inspection.
# NOTE: this overwrites CHARTEVENTS_dfs in place, so re-running the cell finds
# no non-numeric rows; reload CHARTEVENTS first if a re-run is needed.
non_numeric_chartevents_dfs = []
non_numeric_chartevents_vals = defaultdict(set)    # ITEMID -> distinct VALUE entries
non_numeric_chartevents_units = defaultdict(set)   # ITEMID -> distinct VALUEUOM entries

for i, chunk_df in enumerate(CHARTEVENTS_dfs):
    numeric_chunk_df = chunk_df[chunk_df.VALUENUM.notnull()].reset_index(drop=True)
    CHARTEVENTS_dfs[i] = numeric_chunk_df

    non_numeric_chunk_df = chunk_df[chunk_df.VALUENUM.isnull() & chunk_df.VALUE.notnull()].reset_index(drop=True)
    non_numeric_chartevents_dfs.append(non_numeric_chunk_df)
    # Group by the column name, not a one-element list: with a list key,
    # pandas >= 2.0 yields 1-tuples such as (220045,), which would never match
    # the integer keys of itemid_label / itemid_category below.
    for itemid, df in non_numeric_chunk_df.groupby('ITEMID'):
        non_numeric_chartevents_vals[itemid].update(set(df.VALUE))
        non_numeric_chartevents_units[itemid].update(set(df.VALUEUOM))

# One summary row per ITEMID. VALS is sorted so the CSV is identical between
# runs, and each entry is passed through str() so a non-text VALUE cannot
# break the join.
non_numeric_chartevents_df = pd.DataFrame({
    'ITEMID': list(non_numeric_chartevents_vals.keys()),
    'LABEL': [itemid_label.get(itemid) for itemid in non_numeric_chartevents_vals],
    'CATEGORY': [itemid_category.get(itemid) for itemid in non_numeric_chartevents_vals],
    'VALS': ["|".join(sorted(map(str, vals))) for vals in non_numeric_chartevents_vals.values()],
})
non_numeric_chartevents_df.to_csv('non_numeric_chartevents_df.csv')

In [15]:
# Distinct VALUEUOM entries seen on the non-numeric chart rows, per ITEMID group.
non_numeric_chartevents_units

### (A-2) Filter out ITEMIDs measured in no more than 5% of patients

In [17]:
# Map each chart ITEMID to the set of patients that have at least one
# measurement of that item, accumulated across all chunks.
chartevents_item_patients = defaultdict(set)
for chunk in CHARTEVENTS_dfs:
    # One row per (ITEMID, SUBJECT_ID) pair is enough to establish coverage.
    item_subject_df = chunk.drop_duplicates(subset=['ITEMID', 'SUBJECT_ID'], ignore_index=True)
    for item_id, subject_ids in item_subject_df.groupby('ITEMID')['SUBJECT_ID']:
        chartevents_item_patients[item_id].update(subject_ids.tolist())
        
        

#### CONCLUSION: No duplicate info between LABEVENTS and CHARTEVENTS

In [18]:
# Three counts, one per line: chart ITEMIDs, lab ITEMIDs, and ITEMIDs present in both.
print(
    len(chartevents_item_patients),
    len(set(LABEVENTS.ITEMID)),
    len(set(chartevents_item_patients) & set(LABEVENTS.ITEMID)),
    sep='\n',
)

In [19]:
# Per-ITEMID patient coverage: absolute count (N_PATIENTS) and fraction of all
# patients (P_PATIENTS), sorted from most to least covered.
chartevents_item_patients_count_df = (
    pd.DataFrame({
        'ITEMID': list(chartevents_item_patients.keys()),
        'N_PATIENTS': [len(patients) for patients in chartevents_item_patients.values()],
    })
    .assign(P_PATIENTS=lambda counts: counts['N_PATIENTS'] / N_PATIENTS)
    .sort_values(by='N_PATIENTS', ascending=False)
)
chartevents_item_patients_count_df

In [20]:
# Fraction of chart ITEMIDs whose patient coverage exceeds 5%.
(chartevents_item_patients_count_df['P_PATIENTS'] > 0.05).mean()

In [21]:
# ITEMIDs whose patient coverage is strictly above 5%.
selected_chartevents_itemid_set = set(
    chartevents_item_patients_count_df.loc[
        chartevents_item_patients_count_df['P_PATIENTS'] > 0.05, 'ITEMID'
    ]
)

In [22]:
# Number of chart ITEMIDs that passed the coverage filter.
len(selected_chartevents_itemid_set)

In [23]:
# Replace each chunk, one at a time, with only the rows of the selected ITEMIDs.
for idx in range(len(CHARTEVENTS_dfs)):
    chunk = CHARTEVENTS_dfs[idx]
    keep_rows = chunk['ITEMID'].isin(selected_chartevents_itemid_set)
    CHARTEVENTS_dfs[idx] = chunk.loc[keep_rows].reset_index(drop=True)

In [24]:
# Stream the filtered chunks into a single gzip CSV: the first chunk creates
# the file and writes the header, every later chunk is appended without one.
for i, df_chunk in enumerate(tqdm(CHARTEVENTS_dfs)):
    is_first_chunk = (i == 0)
    df_chunk.to_csv(
        f'{multi_visit_mimic_dir}/CHARTEVENTS_Q5.csv.gz',
        mode='w' if is_first_chunk else 'a',
        header=is_first_chunk,
        index=False,
        compression='gzip',
    )

## (B) Select LABEVENTS with ITEMID covering at least 5% of all patients in the dataset

### (B-1) Drop non-numerical measurements

In [26]:
# Split LABEVENTS into numeric rows (VALUENUM present) and non-numeric rows
# (VALUENUM missing but VALUE present), and record the distinct values and
# units seen on the non-numeric rows of each ITEMID.
non_numeric_labevents_vals = defaultdict(set)    # ITEMID -> distinct VALUE entries
non_numeric_labevents_units = defaultdict(set)   # ITEMID -> distinct VALUEUOM entries

numeric_labevents_df = LABEVENTS[LABEVENTS.VALUENUM.notnull()].reset_index(drop=True)

# Fixed: this used to filter `chunk_df`, a variable left over from the
# CHARTEVENTS loop, so the "lab" summary actually described the last chart chunk.
non_numeric_labevents_df = LABEVENTS[LABEVENTS.VALUENUM.isnull() & LABEVENTS.VALUE.notnull()].reset_index(drop=True)

# Group by the column name, not a one-element list: with a list key,
# pandas >= 2.0 yields 1-tuples as group keys instead of the scalar ITEMID.
for itemid, df in non_numeric_labevents_df.groupby('ITEMID'):
    non_numeric_labevents_vals[itemid] = set(df.VALUE)
    non_numeric_labevents_units[itemid] = set(df.VALUEUOM)

In [28]:
# One summary row per lab ITEMID that has non-numeric values.
# Fixed: LABEL/CATEGORY are now looked up in the lab dictionaries
# (labitem_label / labitem_category); the chart dictionaries used before
# describe a different set of items.
# VALS is sorted so the CSV is identical between runs, and each entry is passed
# through str() so a non-text VALUE cannot break the join.
non_numeric_labevents_df = pd.DataFrame({
    'ITEMID': list(non_numeric_labevents_vals.keys()),
    'LABEL': [labitem_label.get(itemid) for itemid in non_numeric_labevents_vals],
    'CATEGORY': [labitem_category.get(itemid) for itemid in non_numeric_labevents_vals],
    'VALS': ["|".join(sorted(map(str, vals))) for vals in non_numeric_labevents_vals.values()],
})

non_numeric_labevents_df.to_csv('non_numeric_labevents_df.csv')

In [29]:
# Map each lab ITEMID to the set of patients that have at least one numeric
# measurement of that item.
labevents_item_patients = defaultdict(set)

# One row per (ITEMID, SUBJECT_ID) pair is enough to establish coverage.
labitem_subject_df = numeric_labevents_df.drop_duplicates(subset=['ITEMID', 'SUBJECT_ID'], ignore_index=True)
for item_id, subject_ids in labitem_subject_df.groupby('ITEMID')['SUBJECT_ID']:
    labevents_item_patients[item_id].update(subject_ids.tolist())

# Per-ITEMID patient coverage: absolute count and fraction of all patients,
# sorted from most to least covered.
labitem_patients_count_df = (
    pd.DataFrame({
        'ITEMID': list(labevents_item_patients.keys()),
        'N_PATIENTS': [len(patients) for patients in labevents_item_patients.values()],
    })
    .assign(P_PATIENTS=lambda counts: counts['N_PATIENTS'] / N_PATIENTS)
    .sort_values(by='N_PATIENTS', ascending=False)
)
labitem_patients_count_df

In [30]:
# Fraction of lab ITEMIDs whose patient coverage exceeds 5%.
(labitem_patients_count_df['P_PATIENTS'] > 0.05).mean()

In [31]:
# Lab ITEMIDs whose patient coverage is strictly above 5%.
selected_labevents_itemid_set = set(
    labitem_patients_count_df.loc[
        labitem_patients_count_df['P_PATIENTS'] > 0.05, 'ITEMID'
    ]
)

In [32]:
# Keep only the numeric lab rows of the selected ITEMIDs and persist them.
LABEVENTS_Q5 = (
    numeric_labevents_df
    .loc[numeric_labevents_df['ITEMID'].isin(selected_labevents_itemid_set)]
    .reset_index(drop=True)
)
LABEVENTS_Q5.to_csv(
    f'{multi_visit_mimic_dir}/LABEVENTS_Q5.csv.gz',
    index=False,
    compression='gzip',
)

In [33]:
# Number of lab ITEMIDs that passed the coverage filter.
len(selected_labevents_itemid_set)

## (C) Investigate the units used for each test type in CHARTEVENTS

### Load Filtered CHARTEVENTS (CHARTEVENTS_Q5)

In [35]:
# Read the filtered chart events back from CHARTEVENTS_Q5.csv.gz as one DataFrame.
CHARTEVENTS_Q5 = pd.read_csv(f'{multi_visit_mimic_dir}/CHARTEVENTS_Q5.csv.gz')

### Investigate numerical/categorical measurements in CHARTEVENTS_Q5


In [40]:


# Group each ITEMID with the set of units it is recorded in (to detect unit
# inconsistency within a test). For every (ITEMID, unit) pair store a tuple
# (row count, mean of VALUENUM, std of VALUENUM); mean/std skip missing values.
chartevents_item_units = defaultdict(dict)

# Missing units are mapped to '' so that they form a group of their own.
# The fill is done once on a separate Series instead of assigning into each
# groupby sub-frame, which avoids SettingWithCopy warnings and any dependence
# on whether a group is a view or a copy.
chartevents_unit_labels = CHARTEVENTS_Q5['VALUEUOM'].fillna('')

for (item_id, unit), unit_values in CHARTEVENTS_Q5['VALUENUM'].groupby(
        [CHARTEVENTS_Q5['ITEMID'], chartevents_unit_labels]):
    chartevents_item_units[item_id][unit] = (
        len(unit_values),
        unit_values.mean(skipna=True),
        unit_values.std(skipna=True),
    )

In [43]:
# One row per chart ITEMID with the number of distinct units it appears in;
# items recorded in the most units come first.
chartevents_item_units_count_df = pd.DataFrame({
    'ITEMID': list(chartevents_item_units),
    'LABEL': [itemid_label.get(item) for item in chartevents_item_units],
    'CATEGORY': [itemid_category.get(item) for item in chartevents_item_units],
    'N_UNITS': [len(units) for units in chartevents_item_units.values()],
}).sort_values(by='N_UNITS', ascending=False)
chartevents_item_units_count_df

In [47]:
# One row per (ITEMID, unit) pair with its row count and the mean/std of
# VALUENUM, in the order of chartevents_item_units_count_df.
# The cell used to build and write a stats-less version of this table first
# and then overwrite both the variable and the CSV; only the final version is
# kept here.
chartevents_item_tuples = [
    (itemid, unit, n, mean, std)
    for itemid in chartevents_item_units_count_df.ITEMID
    for unit, (n, mean, std) in chartevents_item_units[itemid].items()
]
chartevents_units_df = pd.DataFrame(chartevents_item_tuples, columns=['ITEMID', 'VALUEUOM', 'N', 'MEAN', 'STD'])
chartevents_units_df['LABEL'] = chartevents_units_df.ITEMID.map(itemid_label)
chartevents_units_df['CATEGORY'] = chartevents_units_df.ITEMID.map(itemid_category)

chartevents_units_df.to_csv('chartevents_units_df.csv')
chartevents_units_df


### CONCLUSION: Units are consistent for each measurement type in CHARTEVENTS

## (D) Investigate the units used for each test type in LABEVENTS

In [50]:
# Read the filtered lab events back from LABEVENTS_Q5.csv.gz.
LABEVENTS_Q5 = pd.read_csv(f'{multi_visit_mimic_dir}/LABEVENTS_Q5.csv.gz')

In [51]:
# Group each ITEMID with the set of units it is recorded in (to detect unit
# inconsistency within a test). For every (ITEMID, unit) pair store a tuple
# (row count, mean of VALUENUM, std of VALUENUM); mean/std skip missing values.
labevents_item_units = defaultdict(dict)

# Missing units are mapped to '' so that they form a group of their own.
# The fill is done once on a separate Series instead of assigning into each
# groupby sub-frame, which avoids SettingWithCopy warnings and any dependence
# on whether a group is a view or a copy.
labevents_unit_labels = LABEVENTS_Q5['VALUEUOM'].fillna('')

for (item_id, unit), unit_values in LABEVENTS_Q5['VALUENUM'].groupby(
        [LABEVENTS_Q5['ITEMID'], labevents_unit_labels]):
    labevents_item_units[item_id][unit] = (
        len(unit_values),
        unit_values.mean(skipna=True),
        unit_values.std(skipna=True),
    )

        

In [54]:
# One row per lab ITEMID with the number of distinct units it appears in;
# items recorded in the most units come first.
labevents_item_units_count_df = pd.DataFrame({
    'ITEMID': list(labevents_item_units),
    'LABEL': [labitem_label.get(item) for item in labevents_item_units],
    'CATEGORY': [labitem_category.get(item) for item in labevents_item_units],
    'N_UNITS': [len(units) for units in labevents_item_units.values()],
}).sort_values(by='N_UNITS', ascending=False)
labevents_item_units_count_df

In [55]:

# ITEMID -> number of distinct units, as a plain dict for use with Series.map.
labitem_nunits = {
    item: n_units
    for item, n_units in zip(labevents_item_units_count_df.ITEMID,
                             labevents_item_units_count_df.N_UNITS)
}

In [59]:
# One row per (ITEMID, unit) pair with its row count and the mean/std of
# VALUENUM, in the order of labevents_item_units_count_df.
labevents_item_tuples = [
    (item, unit_name, count, avg, spread)
    for item in labevents_item_units_count_df.ITEMID
    for unit_name, (count, avg, spread) in labevents_item_units[item].items()
]
labevents_units_df = pd.DataFrame(
    labevents_item_tuples,
    columns=['ITEMID', 'VALUEUOM', 'N', 'MEAN', 'STD'],
)

# Attach the dictionary label/category and the per-item unit count.
for column, lookup in (('LABEL', labitem_label),
                       ('CATEGORY', labitem_category),
                       ('N_UNITS', labitem_nunits)):
    labevents_units_df[column] = labevents_units_df.ITEMID.map(lookup)

labevents_units_df.to_csv('labevents_units_df.csv')
labevents_units_df

### (D-1) Convert only units for (ITEMID=50889, C-Reactive Protein)

- Convert from mg/dL to mg/L

In [66]:
# Rows of ITEMID 50889 whose unit is one of the two mg/dL spellings.
to_convert_units = ['MG/DL', 'mg/dL']
cond = LABEVENTS_Q5['ITEMID'].eq(50889) & LABEVENTS_Q5['VALUEUOM'].isin(to_convert_units)
LABEVENTS_Q5.loc[cond]

In [None]:
# Convert the selected rows from mg/dL to mg/L (x10).
# The unit is re-checked against the current data so that running this cell a
# second time cannot scale already-converted rows again.
needs_conversion = cond & LABEVENTS_Q5.VALUEUOM.isin(to_convert_units)
converted_valuenum = LABEVENTS_Q5.loc[needs_conversion, 'VALUENUM'] * 10

LABEVENTS_Q5.loc[needs_conversion, 'VALUENUM'] = converted_valuenum
# VALUE comes back from the CSV either as numbers or as text. Multiplying a
# text column by 10 repeats each string ten times, so VALUE is rewritten from
# the converted VALUENUM instead, as text when the column holds text.
if pd.api.types.is_numeric_dtype(LABEVENTS_Q5['VALUE']):
    LABEVENTS_Q5.loc[needs_conversion, 'VALUE'] = converted_valuenum
else:
    LABEVENTS_Q5.loc[needs_conversion, 'VALUE'] = converted_valuenum.astype(str)
LABEVENTS_Q5.loc[needs_conversion, 'VALUEUOM'] = 'mg/L'


In [69]:
# Show the rows selected by `cond` again to inspect them after the conversion cell.
LABEVENTS_Q5[cond]

In [70]:
# Persist the unit-corrected lab events.
# Fixed: the call passed an undefined name `w` as a positional argument, which
# raised NameError; it now uses the same gzip / no-index options as the
# LABEVENTS_Q5 export.
LABEVENTS_Q5.to_csv(f'{multi_visit_mimic_dir}/LABEVENTS_Q5_UNITS_FIXED.csv.gz',
                    compression='gzip',
                    index=False,)