In [2]:
import numpy as np
import pandas as pd
import json
import collections
from collections import defaultdict 
from functools import partial
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm


### This notebook transforms the information in the tables into points with "day" as the smallest time unit


In [3]:
# Good read: https://iq-inc.com/importerror-attempted-relative-import/

import sys
import importlib
from mimicnet import concept

# Re-execute the already-imported `mimicnet.concept` module so that edits made
# to its source are picked up without restarting the kernel.
importlib.reload(sys.modules['mimicnet.concept'])

In [18]:
# Directory holding the multi-visit MIMIC-III CSV files read below.
# multi_visit_mimic_dir = '/home/am8520/GP/ehr-data/mimic3-multi-visit'
multi_visit_mimic_dir = '/home/asem/GP/ehr-data/mimic3-multi-visit'

PATIENTS = pd.read_csv(f'{multi_visit_mimic_dir}/PATIENTS.csv.gz')
ADMISSIONS = pd.read_csv(f'{multi_visit_mimic_dir}/ADMISSIONS.csv.gz')
DIAGNOSES_ICD = pd.read_csv(f'{multi_visit_mimic_dir}/DIAGNOSES_ICD.csv.gz')
PROCEDURES_ICD = pd.read_csv(f'{multi_visit_mimic_dir}/PROCEDURES_ICD.csv.gz')

# Only these LABEVENTS columns are kept, so ask the parser to skip the others
# instead of loading the full table and discarding columns afterwards.
labevents_columns = ['SUBJECT_ID', 'ITEMID', 'CHARTTIME', 'VALUE', 'VALUENUM', 'VALUEUOM']
LABEVENTS = pd.read_csv(f'{multi_visit_mimic_dir}/LABEVENTS.csv.gz', usecols=labevents_columns)
# `usecols` keeps the file's column order; re-select to get the order listed above.
LABEVENTS = LABEVENTS[labevents_columns]

In [5]:
# Number of rows in the PATIENTS table; used below as the denominator when
# computing the share of patients covered by each ITEMID.
N_PATIENTS = len(PATIENTS)
N_PATIENTS

In [6]:
# CHARTEVENTS is read in chunks of `chunksize` rows; each chunk is reduced to
# the columns below and kept in a list.
chunksize = 10 ** 7
chartevents_columns = ['SUBJECT_ID', 'ITEMID', 'CHARTTIME', 'VALUE', 'VALUENUM', 'VALUEUOM']
CHARTEVENTS_dfs = []
# `usecols` makes the parser skip the unused columns, so they are never
# materialised for any chunk.
with pd.read_csv(f'{multi_visit_mimic_dir}/CHARTEVENTS.csv.gz',
                 chunksize=chunksize,
                 usecols=chartevents_columns) as reader:
    for chunk in tqdm(reader):
        # Re-select to get the column order listed above (usecols keeps file order).
        CHARTEVENTS_dfs.append(chunk[chartevents_columns])

### Load dictionary tables (D_LABITEMS, D_ITEMS)

In [7]:
# Directory from which the item dictionary tables are read.
# mimic_dir = '/home/am8520/GP/ehr-data/mimic3-multi-visit'
mimic_dir = '/home/asem/GP/ehr-data/mimic3-v1.4/physionet.org/files/mimiciii/1.4'

# Read both dictionary tables: lab item definitions, then chart item definitions.
D_LABITEMS, D_ITEMS = (
    pd.read_csv(f'{mimic_dir}/{table_name}.csv.gz')
    for table_name in ('D_LABITEMS', 'D_ITEMS')
)

In [8]:
# Preview the first rows of the D_ITEMS dictionary table.
D_ITEMS.head()

In [100]:
# Preview the first rows of the D_LABITEMS dictionary table.
D_LABITEMS.head()

In [9]:
# Preview the first rows of the first CHARTEVENTS chunk.
CHARTEVENTS_dfs[0].head()

In [19]:
# Preview the first rows of the (column-reduced) LABEVENTS table.
LABEVENTS.head()

In [10]:
# Number of rows per ITEMID in the first CHARTEVENTS chunk only (not the whole table).
CHARTEVENTS_dfs[0].ITEMID.value_counts()


### Select CHARTEVENTS with ITEMID covering at least 5% of all patients in the dataset

In [87]:
# Map every chart-event ITEMID to the set of patients (SUBJECT_ID) that have
# at least one measurement of that item, accumulated across all chunks.
chartevents_item_patients = defaultdict(set)
for chunk in CHARTEVENTS_dfs:
    # One row per distinct (ITEMID, SUBJECT_ID) pair within this chunk.
    unique_pairs = chunk.drop_duplicates(subset=['ITEMID', 'SUBJECT_ID'], ignore_index=True)
    for item_id, subject_ids in unique_pairs.groupby('ITEMID')['SUBJECT_ID']:
        chartevents_item_patients[item_id] |= set(subject_ids.tolist())
        
        

#### CONCLUSION: No duplicate info between LABEVENTS and CHARTEVENTS

In [88]:
# Number of distinct ITEMIDs seen in CHARTEVENTS.
print(len(chartevents_item_patients))
# Number of distinct ITEMIDs seen in LABEVENTS.
print(len(set(LABEVENTS.ITEMID)))
# Number of ITEMIDs that appear in both tables.
print(len(set(chartevents_item_patients.keys()) & set(LABEVENTS.ITEMID)))

In [89]:
# Tabulate, per chart-event ITEMID, how many patients have it (N_PATIENTS) and
# what fraction of all patients that is (P_PATIENTS), most common items first.
chartevents_item_patients_count_df = pd.DataFrame({
    'ITEMID': list(chartevents_item_patients),
    'N_PATIENTS': [len(subjects) for subjects in chartevents_item_patients.values()],
})
chartevents_item_patients_count_df = (
    chartevents_item_patients_count_df
    .assign(P_PATIENTS=lambda counts: counts['N_PATIENTS'] / N_PATIENTS)
    .sort_values(by='N_PATIENTS', ascending=False)
)
chartevents_item_patients_count_df

In [90]:
# Fraction of chart-event ITEMIDs measured in more than 5% of patients.
np.mean(chartevents_item_patients_count_df.P_PATIENTS > 0.05)

In [91]:
# Chart-event ITEMIDs measured in more than 5% of patients.
covers_enough_patients = chartevents_item_patients_count_df.P_PATIENTS > 0.05
selected_chartevents_itemid_set = set(
    chartevents_item_patients_count_df.loc[covers_enough_patients, 'ITEMID']
)

In [92]:
# Number of chart-event ITEMIDs that passed the patient-coverage filter.
len(selected_chartevents_itemid_set)

In [93]:
# Replace every chunk (in the same list) by its rows whose ITEMID was selected,
# with a fresh 0..n-1 index.
CHARTEVENTS_dfs[:] = [
    chunk[chunk.ITEMID.isin(selected_chartevents_itemid_set)].reset_index(drop=True)
    for chunk in CHARTEVENTS_dfs
]

In [94]:
# Write the filtered chunks to a single gzip-compressed CSV: the first chunk
# creates/overwrites the file and carries the header row, every later chunk is
# appended without a header.
for chunk_index, chunk in enumerate(tqdm(CHARTEVENTS_dfs)):
    is_first_chunk = chunk_index == 0
    chunk.to_csv(
        f'{multi_visit_mimic_dir}/CHARTEVENTS_Q5.csv.gz',
        mode='w' if is_first_chunk else 'a',
        header=is_first_chunk,
        index=False,
        compression='gzip')

### Select LABEVENTS with ITEMID covering at least 5% of all patients in the dataset

In [95]:
# Map every lab-event ITEMID to the set of patients (SUBJECT_ID) that have at
# least one measurement of that item.
labevents_item_patients = defaultdict(set)

# One row per distinct (ITEMID, SUBJECT_ID) pair.
labitem_subject_df = LABEVENTS.drop_duplicates(subset=['ITEMID', 'SUBJECT_ID'], ignore_index=True)
for item_id, subject_ids in labitem_subject_df.groupby('ITEMID')['SUBJECT_ID']:
    labevents_item_patients[item_id] |= set(subject_ids.tolist())

# Per ITEMID: patient count (N_PATIENTS) and fraction of all patients
# (P_PATIENTS), most common items first.
labitem_patients_count_df = pd.DataFrame({
    'ITEMID': list(labevents_item_patients),
    'N_PATIENTS': [len(subjects) for subjects in labevents_item_patients.values()],
})
labitem_patients_count_df = (
    labitem_patients_count_df
    .assign(P_PATIENTS=lambda counts: counts['N_PATIENTS'] / N_PATIENTS)
    .sort_values(by='N_PATIENTS', ascending=False)
)
labitem_patients_count_df

In [96]:
# Fraction of lab-event ITEMIDs measured in more than 5% of patients.
(labitem_patients_count_df.P_PATIENTS > 0.05).mean()

In [97]:
# Lab-event ITEMIDs measured in more than 5% of patients.
lab_covers_enough_patients = labitem_patients_count_df.P_PATIENTS > 0.05
selected_labevents_itemid_set = set(
    labitem_patients_count_df.loc[lab_covers_enough_patients, 'ITEMID']
)

In [98]:
# Keep only lab events whose ITEMID was selected, re-index from zero, and save
# the result as a gzip-compressed CSV without the index column.
is_selected_labitem = LABEVENTS.ITEMID.isin(selected_labevents_itemid_set)
LABEVENTS_Q5 = LABEVENTS[is_selected_labitem].reset_index(drop=True)
LABEVENTS_Q5.to_csv(
    f'{multi_visit_mimic_dir}/LABEVENTS_Q5.csv.gz',
    index=False,
    compression='gzip',
)

### Investigate the units used for each test type in CHARTEVENTS

In [72]:
# Group each ITEMID with the set of units used for it (to detect unit
# inconsistency for each unique test), accumulated across all chunks.
chartevents_item_units = defaultdict(set)
for df in CHARTEVENTS_dfs:
    # One row per distinct (ITEMID, VALUEUOM) pair within this chunk.
    item_unit_df = df.drop_duplicates(subset=['ITEMID', 'VALUEUOM'], ignore_index=True)
    for item_id, units_df in item_unit_df.groupby('ITEMID'):
        # A missing unit is recorded as ''. Fill only the VALUEUOM column:
        # assigning '' through a boolean row mask would overwrite every column
        # of those rows (ITEMID, VALUENUM, ...) on a groupby slice.
        chartevents_item_units[item_id].update(units_df.VALUEUOM.fillna(''))

In [73]:
# Number of distinct units recorded per chart-event ITEMID, items with the
# most units first.
chartevents_item_units_count_df = pd.DataFrame({
    'ITEMID': list(chartevents_item_units),
    'N_UNITS': [len(units) for units in chartevents_item_units.values()],
}).sort_values(by='N_UNITS', ascending=False)
chartevents_item_units_count_df

In [76]:
# Lookup tables from chart-event ITEMID to its label and to its category,
# taken from the D_ITEMS dictionary table.
itemid_label = dict(zip(D_ITEMS.ITEMID, D_ITEMS.LABEL))
# Built from the CATEGORY column; using LABEL here would make the CATEGORY
# lookup a copy of the label lookup.
itemid_category = dict(zip(D_ITEMS.ITEMID, D_ITEMS.CATEGORY))

chartevents_item_units_count_df

In [79]:
# Flatten the ITEMID -> units mapping into (ITEMID, unit) pairs, following the
# ITEMID order of the unit-count table.
chartevents_item_tuples = [
    (itemid, unit)
    for itemid in chartevents_item_units_count_df.ITEMID
    for unit in chartevents_item_units[itemid]
]
# One row per (ITEMID, unit), annotated with the item's label and category,
# then saved to CSV (index included) and displayed.
chartevents_units_df = pd.DataFrame(chartevents_item_tuples, columns=['ITEMID', 'VALUEUOM'])
chartevents_units_df = chartevents_units_df.assign(
    LABEL=chartevents_units_df.ITEMID.map(itemid_label),
    CATEGORY=chartevents_units_df.ITEMID.map(itemid_category),
)
chartevents_units_df.to_csv('chartevents_units_df.csv')
chartevents_units_df

### CONCLUSION: Units are consistent for each measurement type in CHARTEVENTS

### Investigate the units used for each test type in LABEVENTS

In [99]:
# Group each ITEMID with the set of units used for it (to detect unit
# inconsistency for each unique test).
labevents_item_units = defaultdict(set)
# One row per distinct (ITEMID, VALUEUOM) pair among the selected lab events.
lab_unit_df = LABEVENTS_Q5.drop_duplicates(subset=['ITEMID', 'VALUEUOM'], ignore_index=True)
for item_id, units_df in lab_unit_df.groupby('ITEMID'):
    # A missing unit is recorded as ''. Fill only the VALUEUOM column:
    # assigning '' through a boolean row mask would overwrite every column of
    # those rows (ITEMID, VALUENUM, ...) on a groupby slice.
    labevents_item_units[item_id].update(units_df.VALUEUOM.fillna(''))

In [102]:
# Number of distinct units recorded per lab-event ITEMID, items with the most
# units first.
labevents_item_units_count_df = pd.DataFrame({
    'ITEMID': list(labevents_item_units),
    'N_UNITS': [len(units) for units in labevents_item_units.values()],
}).sort_values(by='N_UNITS', ascending=False)
labevents_item_units_count_df

In [106]:
# Lookup tables keyed by lab-event ITEMID: label and category come from
# D_LABITEMS, the unit count from the table computed above.
labitem_label = {
    item_id: label
    for item_id, label in zip(D_LABITEMS.ITEMID, D_LABITEMS.LABEL)
}
labitem_category = {
    item_id: category
    for item_id, category in zip(D_LABITEMS.ITEMID, D_LABITEMS.CATEGORY)
}
labitem_nunits = {
    item_id: n_units
    for item_id, n_units in zip(labevents_item_units_count_df.ITEMID,
                                labevents_item_units_count_df.N_UNITS)
}

In [108]:
# Flatten the ITEMID -> units mapping into (ITEMID, unit) pairs, following the
# ITEMID order of the unit-count table.
labevents_item_tuples = []
for itemid in labevents_item_units_count_df.ITEMID:
    for unit in labevents_item_units[itemid]:
        labevents_item_tuples.append((itemid, unit))
labevents_units_df = pd.DataFrame(labevents_item_tuples, columns=['ITEMID', 'VALUEUOM'])
labevents_units_df['LABEL'] = labevents_units_df.ITEMID.map(labitem_label)
# Lab items must be looked up in the lab dictionary (`labitem_category`), not
# in the chart-event dictionary built from D_ITEMS.
labevents_units_df['CATEGORY'] = labevents_units_df.ITEMID.map(labitem_category)
labevents_units_df['N_UNITS'] = labevents_units_df.ITEMID.map(labitem_nunits)

# Keep only lab items recorded with more than one distinct unit, then save to
# CSV (index included) and display.
labevents_units_df = labevents_units_df[labevents_units_df.N_UNITS > 1]
labevents_units_df.to_csv('labevents_units_df.csv')
labevents_units_df