In [1]:
from __future__ import print_function, division

%load_ext autoreload

import sys
sys.path.append('..')

In [2]:
%autoreload

import copy, os, pickle, psycopg2, pandas as pd, numpy as np

from mimic_direct_extract import (
    save_numerics, get_values_by_name_from_df_column_or_index, get_variable_mapping, get_variable_ranges
)
from datapackage_io_util import (
    load_datapackage_schema,
    load_sanitized_df_from_csv,
    save_sanitized_df_to_csv,
    sanitize_df,
)

# Build Data

In [3]:
# Load the pickled schema objects that the fixtures below start from.
# build_sample_data copies schema_X / schema_I / schema_data / schema_var_ranges
# and appends a single row to each, so they are presumably empty frames that only
# carry the expected columns and dtypes -- TODO confirm against the resource file.
# NOTE(review): pickle.load can execute arbitrary code; only load a trusted pickle.
with open('../resources/testing_schemas.pkl', mode='rb') as f:
    schema_data, schema_X, schema_I, schema_var_ranges, schema_var_map, schema_got_out = pickle.load(f)

In [4]:
# Fixed placeholder values that build_sample_data writes into every synthetic
# row; the fields that vary between fixtures are function arguments instead.
GENDER, ETHNICITY, AGE      = 'U', 'U', 40
DBSOURCE, LINKSTO, CATEGORY = 'TEST', 'TEST', 'TEST'
ADMITTIME, DISCHTIME        = '2100-10-01 00:00:00', '2100-11-01 00:00:00'
DEATHTIME                   = '2101-10-01 00:00:00'
LOS_ICU, ADMISSION_TYPE     = 31, 'U'
FIRST_CAREUNIT, MORT_ICU    = 'U', 0
MORT_HOSP                   = 0
HOSPITAL_EXPIRE_FLAG        = 0
HOSPSTAY_SEQ                = 1

# Names of the columns that hold timestamps and need datetime parsing.
DATETIME_COLS = {
    'charttime', 'admittime', 'dischtime', 'deathtime', 'intime', 'outtime',
}


def make_datetime(df):
    """Parse every timestamp column present in ``df`` and return ``df``.

    Each column of ``df`` whose name is listed in DATETIME_COLS is replaced by
    ``pd.to_datetime`` of itself. The frame is modified in place and the same
    object is handed back so the call can be chained.
    """
    timestamp_columns = DATETIME_COLS.intersection(df.columns)
    for column_name in timestamp_columns:
        df[column_name] = pd.to_datetime(df[column_name])
    return df


def _append_rows(frame, other, ignore_index=False):
    """Return ``frame`` with ``other`` (a dict row or a DataFrame) appended.

    ``DataFrame.append`` is called whenever pandas still provides it, so results
    are unchanged on the pandas versions this notebook was written against.
    The method was removed in pandas 2.0; there ``pd.concat`` is used instead.
    """
    if hasattr(pd.DataFrame, 'append'):
        return frame.append(other, ignore_index=ignore_index)
    if isinstance(other, dict):
        other = pd.DataFrame([other])
    return pd.concat([frame, other], ignore_index=ignore_index, sort=False)


def build_sample_data(
    subject_id, hadm_id, icustay_id, itemid,
    itemid_label, itemid_unitname, level2, level1,
    intime, outtime,
    charttime_hour1,
    value_hour1,
    valueuom_hour1,
    outlier_low=np.nan,
    valid_low=np.nan,
    impute=np.nan,
    valid_high=np.nan,
    outlier_high=np.nan
):
    """Build the five input frames describing a single synthetic observation.

    Every frame starts as a copy of the matching module-level schema object and
    gets exactly one row appended. Fields that are not arguments are filled
    from the module-level placeholder constants (GENDER, ADMITTIME, ...).

    Args:
        subject_id, hadm_id, icustay_id: identifiers written to ``X`` and ``data``;
            ``icustay_id`` also becomes the index of ``data``.
        itemid: identifier of the charted item; index of ``I``, ``ITEMID`` in ``var_map``.
        itemid_label, itemid_unitname: ``label`` / ``unitname`` of the item in ``I``.
        level2, level1: grouping labels written to ``var_map``; ``level2`` is also
            the index key of the ``var_range`` row.
        intime, outtime: ICU stay boundaries written to ``data``.
        charttime_hour1, value_hour1, valueuom_hour1: the observation itself.
        outlier_low, valid_low, impute, valid_high, outlier_high: range limits
            stored in ``var_range``; NaN by default.

    Returns:
        tuple ``(X, data, I, var_map, var_range)``.

    TODO(mmd): Generalize (slightly!!!)
    """
    # One charted observation.
    X = make_datetime(_append_rows(
        schema_X.copy(),
        {
            'subject_id': subject_id,
            'hadm_id':    hadm_id,
            'icustay_id': icustay_id,
            'charttime':  charttime_hour1,
            'itemid':     itemid,
            'value':      value_hour1,
            'valueuom':   valueuom_hour1,
        },
        ignore_index = True,
    ))

    # Item metadata, indexed by itemid.
    I = _append_rows(
        schema_I.copy(),
        pd.DataFrame(
            {
                'label':    itemid_label,
                'dbsource': DBSOURCE,
                'linksto':  LINKSTO,
                'category': CATEGORY,
                'unitname': itemid_unitname,
            },
            index = pd.Index([itemid], name='itemid'),
        ),
    )

    # Static stay-level row, re-indexed by icustay_id after the append.
    data = make_datetime(_append_rows(
        schema_data.copy(),
        {
            'subject_id':           subject_id,
            'hadm_id':              hadm_id,
            'gender':               GENDER,
            'ethnicity':            ETHNICITY,
            'age':                  AGE,
            'admittime':            ADMITTIME,
            'dischtime':            DISCHTIME,
            'deathtime':            DEATHTIME,
            'intime':               intime,
            'outtime':              outtime,
            'los_icu':              LOS_ICU,
            'admission_type':       ADMISSION_TYPE,
            'first_careunit':       FIRST_CAREUNIT,
            'mort_icu':             MORT_ICU,
            'mort_hosp':            MORT_HOSP,
            'hospital_expire_flag': HOSPITAL_EXPIRE_FLAG,
            'hospstay_seq':         HOSPSTAY_SEQ,
        },
        ignore_index = True,
    ))
    data.index = [icustay_id]
    data.index.names = ['icustay_id']

    # Mapping from the item to its two grouping levels.
    var_map = pd.DataFrame(
        [[level2, level1, itemid]],
        columns = ['LEVEL2', 'LEVEL1', 'ITEMID'],
    )

    # Range limits for the level-2 variable.
    # NOTE(review): the appended row is keyed by level2 although its index is
    # named 'itemid'; kept exactly as before -- confirm what save_numerics expects.
    var_range = _append_rows(
        schema_var_ranges.copy(),
        pd.DataFrame(
            {
                'OUTLIER_LOW':  outlier_low,
                'VALID_LOW':    valid_low,
                'IMPUTE':       impute,
                'VALID_HIGH':   valid_high,
                'OUTLIER_HIGH': outlier_high,
            },
            index = pd.Index([level2], name='itemid'),
        ),
    )
    return X, data, I, var_map, var_range

def build_lvl2_out(
    subject_id = 1, hadm_id = 1, icustay_id = 1,
    aggregation_functions = None,
    level2 = 'test_level2'
):
    """Build an expected-output frame with (LEVEL2, Aggregation Function) columns.

    Args:
        subject_id, hadm_id, icustay_id: identifiers repeated on every row of the index.
        aggregation_functions: dict mapping an aggregation name (e.g. 'count',
            'mean', 'std') to its list of per-hour values. When None, a six-hour
            default with one observation of value 3 in hour 1 is used.
        level2: label placed in the top column level.

    Returns:
        DataFrame indexed by (subject_id, hadm_id, icustay_id, hours_in), where
        hours_in counts up from 0, with one column per aggregation.
    """
    if aggregation_functions is None:
        # Created per call so no dict is shared between invocations.
        aggregation_functions = {
            'count': [0, 1, 0, 0, 0, 0],
            'mean':  [np.nan, 3, np.nan, np.nan, np.nan, np.nan],
            'std':   [np.nan, 0, np.nan, np.nan, np.nan, np.nan],
        }

    out = pd.DataFrame(aggregation_functions)
    out['subject_id'] = subject_id
    out['hadm_id']    = hadm_id
    out['icustay_id'] = icustay_id
    out['hours_in']   = np.arange(len(out))
    out = out.set_index(['subject_id', 'hadm_id', 'icustay_id', 'hours_in'])
    # Only the aggregation columns remain; put the level-2 label above them.
    out.columns = pd.MultiIndex.from_tuples(
        [(level2, agg_name) for agg_name in out.columns],
        names=('LEVEL2', 'Aggregation Function')
    )

    return out

def build_nogroup_out(
    subject_id = 1, hadm_id = 1, icustay_id = 1,
    aggregation_functions = None,
    level2 = 'test_level2',
    level1 = 'test_level1',
    itemid = 1,
    label = 'test'
):
    """Build an expected-output frame with per-item (ungrouped) columns.

    Args:
        subject_id, hadm_id, icustay_id: identifiers repeated on every row of the index.
        aggregation_functions: dict mapping an aggregation name (e.g. 'count',
            'mean', 'std') to its list of per-hour values. When None, a six-hour
            default with one observation of value 3 in hour 1 is used.
        level2, level1, itemid, label: labels placed in the column levels above
            the aggregation name.

    Returns:
        DataFrame indexed by (subject_id, hadm_id, icustay_id, hours_in), where
        hours_in counts up from 0, with five-level columns
        (itemid, label, LEVEL1, LEVEL2, Aggregation Function).
    """
    if aggregation_functions is None:
        # Created per call so no dict is shared between invocations.
        aggregation_functions = {
            'count': [0, 1, 0, 0, 0, 0],
            'mean':  [np.nan, 3, np.nan, np.nan, np.nan, np.nan],
            'std':   [np.nan, 0, np.nan, np.nan, np.nan, np.nan],
        }

    out = pd.DataFrame(aggregation_functions)
    out['subject_id'] = subject_id
    out['hadm_id']    = hadm_id
    out['icustay_id'] = icustay_id
    out['hours_in']   = np.arange(len(out))
    out = out.set_index(['subject_id', 'hadm_id', 'icustay_id', 'hours_in'])
    # Only the aggregation columns remain; stack the item labels above them.
    out.columns = pd.MultiIndex.from_tuples(
        [(itemid, label, level1, level2, agg_name) for agg_name in out.columns],
        names=('itemid', 'label', 'LEVEL1', 'LEVEL2', 'Aggregation Function')
    )

    return out

## Multiple observations data

In [5]:
# Three readings of the same item for one ICU stay: values 3 and 5 fall in
# hour 1, value 6 falls in hour 3. Only chart time and value differ per call.
multi_obs_kwargs = dict(
    subject_id = 1, hadm_id = 1, icustay_id = 1, itemid = '1',
    itemid_label = 'test', itemid_unitname = 'N/A', level2 = 'test_level2', level1 = 'test_level1',
    intime = '2100-10-01 00:00:00', outtime = '2100-10-01 05:00:00',
    valueuom_hour1 = 'm'
)

X, data, I, var_map, _ = build_sample_data(
    charttime_hour1 = '2100-10-01 01:15:00', value_hour1 = '3', **multi_obs_kwargs
)
X_2, _, _, _, _ = build_sample_data(
    charttime_hour1 = '2100-10-01 01:35:00', value_hour1 = '5', **multi_obs_kwargs
)
X_3, _, _, _, _ = build_sample_data(
    charttime_hour1 = '2100-10-01 03:35:00', value_hour1 = '6', **multi_obs_kwargs
)

X = pd.concat([X, X_2, X_3])

# Expected per-hour aggregates for hours 0-5; the same numbers apply with and
# without level-2 grouping because only one item is present.
# pd grouping gives a NaN std when an hour has only 1 obs. TODO Probably want to fix that.
# pd grouping calls counts NaNs when zero for some reason. TODO Probably want to fix that.
multi_obs_aggregates = {
    'count': [0, 2, 0, 1, 0, 0],
    'mean':  [np.nan, 4, np.nan, 6, np.nan, np.nan],
    'std':   [np.nan, np.sqrt(2), np.nan, np.nan, np.nan, np.nan],
}

expected_out_level2 = build_lvl2_out(
    subject_id = 1, hadm_id = 1, icustay_id = 1,
    aggregation_functions = multi_obs_aggregates,
    level2 = 'test_level2'
)

expected_out_no_group = build_nogroup_out(
    subject_id = 1, hadm_id = 1, icustay_id = 1,
    aggregation_functions = multi_obs_aggregates,
    level2 = 'test_level2',
    level1 = 'test_level1',
    itemid = '1',
    label = 'test'
)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [6]:
X

Unnamed: 0,subject_id,hadm_id,icustay_id,charttime,itemid,value,valueuom
0,1,1,1,2100-10-01 01:15:00,1,3,m
0,1,1,1,2100-10-01 01:35:00,1,5,m
0,1,1,1,2100-10-01 03:35:00,1,6,m


In [7]:
data

Unnamed: 0_level_0,subject_id,hadm_id,gender,ethnicity,age,admittime,dischtime,deathtime,intime,outtime,los_icu,admission_type,first_careunit,mort_icu,mort_hosp,hospital_expire_flag,hospstay_seq
icustay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,1,1,U,U,40.0,2100-10-01,2100-11-01,2101-10-01,2100-10-01,2100-10-01 05:00:00,31.0,U,U,0,0,0,1


In [8]:
I

Unnamed: 0_level_0,category,dbsource,label,linksto,unitname
itemid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,TEST,TEST,test,TEST,


In [9]:
var_map

Unnamed: 0,LEVEL2,LEVEL1,ITEMID
0,test_level2,test_level1,1


In [10]:
expected_out_level2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LEVEL2,test_level2,test_level2,test_level2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Aggregation Function,count,mean,std
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,1,1,0,0,,
1,1,1,1,2,4.0,1.414214
1,1,1,2,0,,
1,1,1,3,1,6.0,
1,1,1,4,0,,
1,1,1,5,0,,


In [11]:
expected_out_no_group

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,itemid,1,1,1
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,label,test,test,test
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,LEVEL1,test_level1,test_level1,test_level1
Unnamed: 0_level_3,Unnamed: 1_level_3,Unnamed: 2_level_3,LEVEL2,test_level2,test_level2,test_level2
Unnamed: 0_level_4,Unnamed: 1_level_4,Unnamed: 2_level_4,Aggregation Function,count,mean,std
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5
1,1,1,0,0,,
1,1,1,1,2,4.0,1.414214
1,1,1,2,0,,
1,1,1,3,1,6.0,
1,1,1,4,0,,
1,1,1,5,0,,


## Outlier Detection Data

In [12]:
# Five readings of one item: 0 and 2 in hour 1, 5 and 12 in hour 3, 18 in hour 4.
# Only the first call supplies range limits, which is where var_range_outlier comes from.
outlier_kwargs = dict(
    subject_id = 1, hadm_id = 1, icustay_id = 1, itemid = '1',
    itemid_label = 'test', itemid_unitname = 'N/A', level2 = 'test_level2', level1 = 'test_level1',
    intime = '2100-10-01 00:00:00', outtime = '2100-10-01 05:00:00',
    valueuom_hour1 = 'm'
)

X_outlier, _, _, _, var_range_outlier = build_sample_data(
    charttime_hour1 = '2100-10-01 01:15:00', value_hour1 = '0',
    outlier_low = 1,
    valid_low = 3,
    impute = np.nan,
    valid_high = 10,
    outlier_high = 15,
    **outlier_kwargs
)
X_2, _, _, _, _ = build_sample_data(
    charttime_hour1 = '2100-10-01 01:35:00', value_hour1 = '2', **outlier_kwargs
)
X_3, _, _, _, _ = build_sample_data(
    charttime_hour1 = '2100-10-01 03:35:00', value_hour1 = '5', **outlier_kwargs
)
X_4, _, _, _, _ = build_sample_data(
    charttime_hour1 = '2100-10-01 03:50:00', value_hour1 = '12', **outlier_kwargs
)
X_5, _, _, _, _ = build_sample_data(
    charttime_hour1 = '2100-10-01 04:10:00', value_hour1 = '18', **outlier_kwargs
)

X_outlier = pd.concat([X_outlier, X_2, X_3, X_4, X_5])

# pd grouping gives a NaN std when an hour has only 1 obs. TODO Probably want to fix that.
# pd grouping calls counts NaNs when there is no observation;
# pd grouping calls counts NaNs when there is only a NaN observation. TODO Probably want to fix that.

# Expected when the range limits are applied: hour 1 keeps a single value of 3
# and hour 3 holds 5 and 10, while hour 4 has no countable value left.
expected_out_detection = build_lvl2_out(
    subject_id = 1, hadm_id = 1, icustay_id = 1, aggregation_functions = {
        'count': [0, 1, 0, 2, 0, 0],
        'mean':  [np.nan, 3, np.nan, 7.5, np.nan, np.nan],
        'std':   [np.nan, np.nan, np.nan, np.sqrt(12.5), np.nan, np.nan],
    }, level2 = 'test_level2'
)

# Expected when the raw values are aggregated untouched.
expected_out_no_detection = build_lvl2_out(
    subject_id = 1, hadm_id = 1, icustay_id = 1, aggregation_functions = {
        'count': [0, 2, 0, 2, 1, 0],
        'mean':  [np.nan, 1, np.nan, 8.5, 18, np.nan],
        'std':   [np.nan, np.sqrt(2), np.nan, np.sqrt(24.5), np.nan, np.nan],
    }, level2 = 'test_level2'
)

In [13]:
X_outlier

Unnamed: 0,subject_id,hadm_id,icustay_id,charttime,itemid,value,valueuom
0,1,1,1,2100-10-01 01:15:00,1,0,m
0,1,1,1,2100-10-01 01:35:00,1,2,m
0,1,1,1,2100-10-01 03:35:00,1,5,m
0,1,1,1,2100-10-01 03:50:00,1,12,m
0,1,1,1,2100-10-01 04:10:00,1,18,m


In [14]:
var_range_outlier

Unnamed: 0,IMPUTE,OUTLIER_HIGH,OUTLIER_LOW,VALID_HIGH,VALID_LOW
test_level2,,15.0,1.0,10.0,3.0


In [15]:
expected_out_detection

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LEVEL2,test_level2,test_level2,test_level2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Aggregation Function,count,mean,std
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,1,1,0,0,,
1,1,1,1,1,3.0,
1,1,1,2,0,,
1,1,1,3,2,7.5,3.535534
1,1,1,4,0,,
1,1,1,5,0,,


In [16]:
expected_out_no_detection

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LEVEL2,test_level2,test_level2,test_level2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Aggregation Function,count,mean,std
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,1,1,0,0,,
1,1,1,1,2,1.0,1.414214
1,1,1,2,0,,
1,1,1,3,2,8.5,4.949747
1,1,1,4,1,18.0,
1,1,1,5,0,,


## Multi-level Data

In [17]:
# Three different items, each observed once during hour 1: 'eye' and 'nose'
# map to the level-2 group 'face', 'arm' maps to 'body'.
multi_level_kwargs = dict(
    subject_id = 1, hadm_id = 1, icustay_id = 1, itemid_unitname = 'N/A',
    intime = '2100-10-01 00:00:00', outtime = '2100-10-01 05:00:00',
    valueuom_hour1 = 'm'
)

X_multi_level, _, I_multi_level, var_map_multi_level, _ = build_sample_data(
    itemid = '1', itemid_label = 'eye', level2 = 'face', level1 = 'eyes',
    charttime_hour1 = '2100-10-01 01:15:00', value_hour1 = '3',
    **multi_level_kwargs
)
X_2, _, I_2, var_map_2, _ = build_sample_data(
    itemid = '2', itemid_label = 'nose', level2 = 'face', level1 = 'nose',
    charttime_hour1 = '2100-10-01 01:35:00', value_hour1 = '5',
    **multi_level_kwargs
)
X_3, _, I_3, var_map_3, _ = build_sample_data(
    itemid = '3', itemid_label = 'arm', level2 = 'body', level1 = 'arms',
    charttime_hour1 = '2100-10-01 01:25:00', value_hour1 = '6',
    **multi_level_kwargs
)

X_multi_level = pd.concat([X_multi_level, X_2, X_3])
I_multi_level = pd.concat([I_multi_level, I_2, I_3])
var_map_multi_level = pd.concat([var_map_multi_level, var_map_2, var_map_3])

# pd grouping gives a NaN std when an hour has only 1 obs. TODO Probably want to fix that.
# pd grouping calls counts NaNs when zero for some reason. TODO Probably want to fix that.

# Grouped by level 2: 'body' sees one value (6), 'face' sees two (3 and 5).
expected_out_level2_multi = pd.concat([
    build_lvl2_out(
        subject_id = 1, hadm_id = 1, icustay_id = 1, aggregation_functions = {
            'count': [0, 1, 0, 0, 0, 0],
            'mean':  [np.nan, 6, np.nan, np.nan, np.nan, np.nan],
            'std':   [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
        }, level2 = 'body'
    ),
    build_lvl2_out(
        subject_id = 1, hadm_id = 1, icustay_id = 1, aggregation_functions = {
            'count': [0, 2, 0, 0, 0, 0],
            'mean':  [np.nan, 4, np.nan, np.nan, np.nan, np.nan],
            'std':   [np.nan, np.sqrt(2), np.nan, np.nan, np.nan, np.nan],
        }, level2 = 'face'
    ),
], axis = 1)

# Ungrouped: every item keeps its own single hour-1 observation.
expected_out_no_group_multi = pd.concat([
    build_nogroup_out(
        subject_id = 1, hadm_id = 1, icustay_id = 1, aggregation_functions = {
            'count': [0, 1, 0, 0, 0, 0],
            'mean':  [np.nan, 3, np.nan, np.nan, np.nan, np.nan],
            'std':   [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
        },
        level2 = 'face', level1 = 'eyes', itemid = '1', label = 'eye'
    ),
    build_nogroup_out(
        subject_id = 1, hadm_id = 1, icustay_id = 1, aggregation_functions = {
            'count': [0, 1, 0, 0, 0, 0],
            'mean':  [np.nan, 5, np.nan, np.nan, np.nan, np.nan],
            'std':   [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
        },
        level2 = 'face', level1 = 'nose', itemid = '2', label = 'nose'
    ),
    build_nogroup_out(
        subject_id = 1, hadm_id = 1, icustay_id = 1, aggregation_functions = {
            'count': [0, 1, 0, 0, 0, 0],
            'mean':  [np.nan, 6, np.nan, np.nan, np.nan, np.nan],
            'std':   [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
        },
        level2 = 'body', level1 = 'arms', itemid = '3', label = 'arm'
    ),
], axis = 1)

In [18]:
X_multi_level

Unnamed: 0,subject_id,hadm_id,icustay_id,charttime,itemid,value,valueuom
0,1,1,1,2100-10-01 01:15:00,1,3,m
0,1,1,1,2100-10-01 01:35:00,2,5,m
0,1,1,1,2100-10-01 01:25:00,3,6,m


In [19]:
I_multi_level

Unnamed: 0_level_0,category,dbsource,label,linksto,unitname
itemid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,TEST,TEST,eye,TEST,
2,TEST,TEST,nose,TEST,
3,TEST,TEST,arm,TEST,


In [20]:
var_map_multi_level

Unnamed: 0,LEVEL2,LEVEL1,ITEMID
0,face,eyes,1
0,face,nose,2
0,body,arms,3


In [21]:
expected_out_level2_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LEVEL2,body,body,body,face,face,face
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Aggregation Function,count,mean,std,count,mean,std
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
1,1,1,0,0,,,0,,
1,1,1,1,1,6.0,,2,4.0,1.414214
1,1,1,2,0,,,0,,
1,1,1,3,0,,,0,,
1,1,1,4,0,,,0,,
1,1,1,5,0,,,0,,


In [22]:
expected_out_no_group_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,itemid,1,1,1,2,2,2,3,3,3
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,label,eye,eye,eye,nose,nose,nose,arm,arm,arm
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,LEVEL1,eyes,eyes,eyes,nose,nose,nose,arms,arms,arms
Unnamed: 0_level_3,Unnamed: 1_level_3,Unnamed: 2_level_3,LEVEL2,face,face,face,face,face,face,body,body,body
Unnamed: 0_level_4,Unnamed: 1_level_4,Unnamed: 2_level_4,Aggregation Function,count,mean,std,count,mean,std,count,mean,std
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5
1,1,1,0,0,,,0,,,0,,
1,1,1,1,1,3.0,,1,5.0,,1,6.0,
1,1,1,2,0,,,0,,,0,,
1,1,1,3,0,,,0,,,0,,
1,1,1,4,0,,,0,,,0,,
1,1,1,5,0,,,0,,,0,,


## Missingness Data

In [23]:
# Same three items as the multi-level fixture, except that 'nose' is charted in
# hour 2. The expected output below contains only the 'face' group; the test
# that uses it passes min_percent = 30.
missing_kwargs = dict(
    subject_id = 1, hadm_id = 1, icustay_id = 1, itemid_unitname = 'N/A',
    intime = '2100-10-01 00:00:00', outtime = '2100-10-01 05:00:00',
    valueuom_hour1 = 'm'
)

X_missing, _, _, _, _ = build_sample_data(
    itemid = '1', itemid_label = 'eye', level2 = 'face', level1 = 'eyes',
    charttime_hour1 = '2100-10-01 01:15:00', value_hour1 = '3',
    **missing_kwargs
)
X_2, _, _, _, _ = build_sample_data(
    itemid = '2', itemid_label = 'nose', level2 = 'face', level1 = 'nose',
    charttime_hour1 = '2100-10-01 02:35:00', value_hour1 = '5',
    **missing_kwargs
)
X_3, _, _, _, _ = build_sample_data(
    itemid = '3', itemid_label = 'arm', level2 = 'body', level1 = 'arms',
    charttime_hour1 = '2100-10-01 01:25:00', value_hour1 = '6',
    **missing_kwargs
)

X_missing = pd.concat([X_missing, X_2, X_3])

# pd grouping gives a NaN std when an hour has only 1 obs. TODO Probably want to fix that.
expected_out_missing = build_lvl2_out(
    subject_id = 1, hadm_id = 1, icustay_id = 1, aggregation_functions = {
        'count': [0, 1, 1, 0, 0, 0],
        'mean':  [np.nan, 3, 5, np.nan, np.nan, np.nan],
        'std':   [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
    }, level2 = 'face'
)

In [24]:
X_missing

Unnamed: 0,subject_id,hadm_id,icustay_id,charttime,itemid,value,valueuom
0,1,1,1,2100-10-01 01:15:00,1,3,m
0,1,1,1,2100-10-01 02:35:00,2,5,m
0,1,1,1,2100-10-01 01:25:00,3,6,m


In [25]:
expected_out_missing

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LEVEL2,face,face,face
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Aggregation Function,count,mean,std
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,1,1,0,0,,
1,1,1,1,1,3.0,
1,1,1,2,1,5.0,
1,1,1,3,0,,
1,1,1,4,0,,
1,1,1,5,0,,


## Unit conversion data

In [26]:
# Weight recorded through two items: item '1' in ounces (35.274 oz in hour 1,
# 352.74 oz in hour 3) and item '3' in kilograms (10 kg in hour 1). The
# expectations below list the ounce readings as 1 and 10.
unit_kwargs = dict(
    subject_id = 1, hadm_id = 1, icustay_id = 1,
    level2 = 'weight', level1 = 'weight',
    intime = '2100-10-01 00:00:00', outtime = '2100-10-01 05:00:00'
)
ounce_item_kwargs = dict(
    itemid = '1', itemid_label = 'weight_oz', itemid_unitname = 'oz', valueuom_hour1 = 'oz'
)
ounce_item_kwargs.update(unit_kwargs)

X_unit, _, I_unit, var_map_unit, _ = build_sample_data(
    charttime_hour1 = '2100-10-01 01:15:00', value_hour1 = '35.274', **ounce_item_kwargs
)
X_2, _, _, _, _ = build_sample_data(
    charttime_hour1 = '2100-10-01 03:35:00', value_hour1 = '352.74', **ounce_item_kwargs
)
X_3, _, I_3, var_map_3, _ = build_sample_data(
    itemid = '3', itemid_label = 'weight_kg', itemid_unitname = 'kg',
    charttime_hour1 = '2100-10-01 01:35:00', value_hour1 = '10', valueuom_hour1 = 'kg',
    **unit_kwargs
)

X_unit = pd.concat([X_unit, X_2, X_3])
I_unit = pd.concat([I_unit, I_3])
var_map_unit = pd.concat([var_map_unit, var_map_3])

# pd grouping gives a NaN std when an hour has only 1 obs. TODO Probably want to fix that.
# pd grouping calls counts NaNs when zero for some reason. TODO Probably want to fix that.

# Grouped by level 2: hour 1 combines 1 (from oz) and 10 (kg); hour 3 holds 10 (from oz).
expected_out_unit_level2 = build_lvl2_out(
    subject_id = 1, hadm_id = 1, icustay_id = 1, aggregation_functions = {
        'count': [0, 2, 0, 1, 0, 0],
        'mean':  [np.nan, 5.5, np.nan, 10, np.nan, np.nan],
        'std':   [np.nan, np.sqrt(40.5), np.nan, np.nan, np.nan, np.nan],
    }, level2 = 'weight'
)

# Ungrouped: one column block per item.
expected_out_unit_no_group = pd.concat([
    build_nogroup_out(
        subject_id = 1, hadm_id = 1, icustay_id = 1, aggregation_functions = {
            'count': [0, 1, 0, 1, 0, 0],
            'mean':  [np.nan, 1, np.nan, 10, np.nan, np.nan],
            'std':   [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
        },
        level2 = 'weight', level1 = 'weight', itemid = '1', label = 'weight_oz'
    ),
    build_nogroup_out(
        subject_id = 1, hadm_id = 1, icustay_id = 1, aggregation_functions = {
            'count': [0, 1, 0, 0, 0, 0],
            'mean':  [np.nan, 10, np.nan, np.nan, np.nan, np.nan],
            'std':   [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
        },
        level2 = 'weight', level1 = 'weight', itemid = '3', label = 'weight_kg'
    ),
], axis = 1)

In [27]:
X_unit

Unnamed: 0,subject_id,hadm_id,icustay_id,charttime,itemid,value,valueuom
0,1,1,1,2100-10-01 01:15:00,1,35.274,oz
0,1,1,1,2100-10-01 03:35:00,1,352.74,oz
0,1,1,1,2100-10-01 01:35:00,3,10.0,kg


In [28]:
I_unit

Unnamed: 0_level_0,category,dbsource,label,linksto,unitname
itemid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,TEST,TEST,weight_oz,TEST,oz
3,TEST,TEST,weight_kg,TEST,kg


In [29]:
var_map_unit

Unnamed: 0,LEVEL2,LEVEL1,ITEMID
0,weight,weight,1
0,weight,weight,3


In [30]:
expected_out_unit_level2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LEVEL2,weight,weight,weight
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Aggregation Function,count,mean,std
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,1,1,0,0,,
1,1,1,1,2,5.5,6.363961
1,1,1,2,0,,
1,1,1,3,1,10.0,
1,1,1,4,0,,
1,1,1,5,0,,


In [31]:
expected_out_unit_no_group

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,itemid,1,1,1,3,3,3
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,label,weight_oz,weight_oz,weight_oz,weight_kg,weight_kg,weight_kg
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,LEVEL1,weight,weight,weight,weight,weight,weight
Unnamed: 0_level_3,Unnamed: 1_level_3,Unnamed: 2_level_3,LEVEL2,weight,weight,weight,weight,weight,weight
Unnamed: 0_level_4,Unnamed: 1_level_4,Unnamed: 2_level_4,Aggregation Function,count,mean,std,count,mean,std
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5
1,1,1,0,0,,,0,,
1,1,1,1,1,1.0,,1,10.0,
1,1,1,2,0,,,0,,
1,1,1,3,1,10.0,,0,,
1,1,1,4,0,,,0,,
1,1,1,5,0,,,0,,


# Tests

In [32]:
# Default keyword arguments passed to save_numerics for every test case; each
# test's own input dict is layered on top of a copy of these. The output path
# and all *_filename entries are None -- presumably so that nothing is written
# to disk; verify against save_numerics.
BASE_PARAMS = {
    'outPath':              None, # Should probably be out_path?
    'columns_filename':     None,
    'subjects_filename':    None,
    'times_filename':       None,
    'dynamic_filename':     None,
    'dynamic_hd5_filename': None,
    'group_by_level2':      True,
    'apply_var_limit':      True,
    'min_percent':          0,
}

# Each entry is (inputs overriding BASE_PARAMS, whether save_numerics is expected
# to raise, expected output frame or None, human-readable test name).
TEST_CASES = [
    (
        {
            'data':       schema_data,
            'X':          schema_X,
            'I':          schema_I,
            'var_map':    schema_var_map,
            'var_ranges': schema_var_ranges,
        },
        True,
        None,
        "Empty Input"
    ),
    (
        {
            'data':       data,
            'X':          X,
            'I':          I,
            'var_map':    var_map,
            'var_ranges': schema_var_ranges,
        },
        False,
        expected_out_level2,
        "Multiple Observation with Grouping"
    ),
    (
        {
            'data':            data,
            'X':               X,
            'I':               I,
            'var_map':         var_map,
            'var_ranges':      schema_var_ranges,
            'group_by_level2': False,
        },
        False,
        expected_out_no_group,
        "Multiple Observation without Grouping"
    ),
    (
        # Range limits are supplied but switched off, so raw values are aggregated.
        {
            'data':            data,
            'X':               X_outlier,
            'I':               I,
            'var_map':         var_map,
            'apply_var_limit': False,
            'var_ranges':      var_range_outlier,
        },
        False,
        expected_out_no_detection,
        "Outlier Detection Not Applied"
    ),
    (
        # apply_var_limit keeps its BASE_PARAMS default (True): limits are enforced.
        {
            'data':            data,
            'X':               X_outlier,
            'I':               I,
            'var_map':         var_map,
            'var_ranges':      var_range_outlier,
        },
        False,
        expected_out_detection,
        "Outlier Detection Applied"
    ),
    (
        {
            'data':            data,
            'X':               X_multi_level,
            'I':               I_multi_level,
            'var_map':         var_map_multi_level,
            'var_ranges':      schema_var_ranges,
        },
        False,
        expected_out_level2_multi,
        "Multiple Level 2 With Grouping"
    ),
    (
        {
            'data':            data,
            'X':               X_multi_level,
            'I':               I_multi_level,
            'var_map':         var_map_multi_level,
            'var_ranges':      schema_var_ranges,
            'group_by_level2': False,
        },
        False,
        expected_out_no_group_multi,
        "Multiple Level 2 No Grouping"
    ),
    (
        {
            'data':            data,
            'X':               X_missing,
            'I':               I_multi_level,
            'var_map':         var_map_multi_level,
            'var_ranges':      schema_var_ranges,
            'min_percent':     30,
        },
        False,
        expected_out_missing,
        "Missing"
    ),
    (
        {
            'data':            data,
            'X':               X_unit,
            'I':               I_unit,
            'var_map':         var_map_unit,
            'var_ranges':      schema_var_ranges,
        },
        False,
        expected_out_unit_level2,
        "Unit conversion - grouping by level2"
    ),
    (
        {
            'data':            data,
            'X':               X_unit,
            'I':               I_unit,
            'var_map':         var_map_unit,
            'var_ranges':      schema_var_ranges,
            'group_by_level2': False,
        },
        False,
        expected_out_unit_no_group,
        "Unit conversion - no grouping"
    ),
]

def _outputs_match(got, want):
    """Return True when ``got`` and ``want`` have equal shape and close values.

    Cells are compared with ``np.isclose`` (NaN counts as equal to NaN). If the
    comparison cannot be carried out at all -- ``got`` is not a frame, or the
    shapes or labels do not line up -- the result is a mismatch rather than an
    exception that would abort the remaining tests.
    """
    try:
        # Without this check np.isclose could silently broadcast e.g. (6, 1) against (6, 3).
        if got.shape != want.shape:
            return False
        values_close = np.isclose(got, want, equal_nan=True)
        both_null = got.isnull() & want.isnull()
        return bool((values_close | both_null).all().all())
    except (AttributeError, TypeError, ValueError):
        return False


failed_tests = []
for test_case, (test_inputs, expect_error, expected_output, name) in enumerate(TEST_CASES):
    pass_msg = "Passed Test %d: %s" % (test_case, name)
    fail_msg = "Failed Test %d: %s" % (test_case, name)

    # Layer the per-test inputs over a copy of the defaults; BASE_PARAMS stays untouched.
    inputs = dict(BASE_PARAMS)
    inputs.update(test_inputs)

    try:
        got_out = save_numerics(**inputs)
    except Exception as e:
        if expect_error:
            print(pass_msg)
        else:
            failed_tests.append(name)
            print('\n'.join((fail_msg, "Test errored unexpectedly", str(e))))
            print('\n\n')
        continue

    if expect_error:
        failed_tests.append(name)
        print('\n'.join((fail_msg, "Test should've errored but didn't")))
        print('\n\n')
        continue

    if _outputs_match(got_out, expected_output):
        print(pass_msg)
    else:
        failed_tests.append(name)
        print(fail_msg + '\nOutputs unequal!')
        print("Want:")
        print(expected_output)

        print("Got:")
        print(got_out)
        print('\n\n')

# Every case has been run and reported above; fail the cell so that a
# "Restart & Run All" cannot finish cleanly while tests are failing.
assert not failed_tests, "Failed tests: %s" % ', '.join(failed_tests)

Passed Test 0: Empty Input
No known ranges for test_level2
Shape of X :  (6, 3)
Passed Test 1: Multiple Observation with Grouping
No known ranges for test_level2


  {'LEVEL2': 'LEVEL2', 'LEVEL1': 'LEVEL1', 'ITEMID': 'itemid'}, axis=1


Shape of X :  (6, 3)
Passed Test 2: Multiple Observation without Grouping
Shape of X :  (6, 3)
Passed Test 3: Outlier Dectection Applied
test_level2 had 4 / 5 rows cleaned:
  2 rows were strict outliers, set to np.nan
  1 rows were low valid outliers, set to 3.00
  1 rows were high valid outliers, set to 10.00

Shape of X :  (6, 3)
Passed Test 4: Outlier Dectection Not Applied
No known ranges for body
No known ranges for face
Shape of X :  (6, 6)
Passed Test 5: Multiple Level 2 With Grouping
No known ranges for body
No known ranges for face
Shape of X :  (6, 9)
Passed Test 6: Multiple Level 2 No Grouping
No known ranges for body
No known ranges for face
Shape of X :  (6, 6)
Passed Test 7: Missing
No known ranges for weight
Shape of X :  (6, 3)
Passed Test 8: Unit conversion - grouping by level2
No known ranges for weight
Shape of X :  (6, 6)
Passed Test 9: Unit conversion - no grouping


In [33]:
got_out

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,itemid,1,1,1,3,3,3
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,label,weight_oz,weight_oz,weight_oz,weight_kg,weight_kg,weight_kg
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,LEVEL1,weight,weight,weight,weight,weight,weight
Unnamed: 0_level_3,Unnamed: 1_level_3,Unnamed: 2_level_3,LEVEL2,weight,weight,weight,weight,weight,weight
Unnamed: 0_level_4,Unnamed: 1_level_4,Unnamed: 2_level_4,Aggregation Function,count,mean,std,count,mean,std
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5
1,1,1,0,0.0,,,0.0,,
1,1,1,1,1.0,1.000001,,1.0,10.0,
1,1,1,2,0.0,,,0.0,,
1,1,1,3,1.0,10.000011,,0.0,,
1,1,1,4,0.0,,,0.0,,
1,1,1,5,0.0,,,0.0,,


In [34]:
expected_output

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,itemid,1,1,1,3,3,3
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,label,weight_oz,weight_oz,weight_oz,weight_kg,weight_kg,weight_kg
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,LEVEL1,weight,weight,weight,weight,weight,weight
Unnamed: 0_level_3,Unnamed: 1_level_3,Unnamed: 2_level_3,LEVEL2,weight,weight,weight,weight,weight,weight
Unnamed: 0_level_4,Unnamed: 1_level_4,Unnamed: 2_level_4,Aggregation Function,count,mean,std,count,mean,std
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5
1,1,1,0,0,,,0,,
1,1,1,1,1,1.0,,1,10.0,
1,1,1,2,0,,,0,,
1,1,1,3,1,10.0,,0,,
1,1,1,4,0,,,0,,
1,1,1,5,0,,,0,,
