In [None]:
import pandas as pd
import re
import numpy as np
import pickle
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold

In [None]:
# Feature-selection output: the column names to keep after aggregation.
with open('../../data/feature_selection_50_columns.pkl', 'rb') as columns_file:
    keep_columns = pickle.load(columns_file)

# Outcome table; merged onto every split by PT_KEY further down.
outcomes = pd.read_pickle('../../data/SSI_outcomes.pkl')

# Long-format records (one row per PT_KEY / time / FEATURE / VALUE) for each split.
train_data = pd.read_pickle('../../data/train_long_data.pkl')
validation_data = pd.read_pickle('../../data/val_long_data.pkl')
test_data = pd.read_pickle('../../data/test_long_data.pkl')

# Data Prep for RNN

## Processing Functions

In [None]:
# Aggregate time steps by days
def isolate_categorical_features(data, time_agg = 'DAYS'):
    """Pivot the non-LOINC rows of ``data`` into one column per FEATURE.

    Parameters
    ----------
    data : DataFrame with columns PT_KEY, TIME_AFTER_OP_<time_agg>,
        TERMINOLOGY, FEATURE and VALUE.
    time_agg : suffix of the time column to aggregate on (e.g. 'DAYS').

    Returns
    -------
    DataFrame with one row per (PT_KEY, time step) and one column per
    FEATURE holding the summed VALUE for that step. Combinations that never
    occur are NaN.
    """
    time_col = 'TIME_AFTER_OP_%s' % time_agg
    group_keys = ['PT_KEY', time_col, 'FEATURE']

    # Everything that is not a LOINC (lab) record is treated as categorical.
    non_lab_rows = data.loc[data['TERMINOLOGY'] != 'LOINC', group_keys + ['VALUE']]

    # Sum VALUE per patient / time step / feature, then spread features to columns.
    totals = non_lab_rows.groupby(group_keys).sum().reset_index()
    wide = totals.pivot_table(values='VALUE', index=['PT_KEY', time_col], columns='FEATURE')
    return wide.reset_index()

def isolate_numeric_features(data, time_agg = 'DAYS'):
    """Summarise the LOINC rows of ``data`` per patient and time step.

    For every (PT_KEY, time step, FEATURE) group the median, mean, minimum
    and maximum of VALUE are computed. Each statistic is returned as its own
    wide frame whose feature columns are named ``<FEATURE>_MEDIAN``,
    ``<FEATURE>_MEAN``, ``<FEATURE>_MIN`` and ``<FEATURE>_MAX``.

    Parameters
    ----------
    data : DataFrame with columns PT_KEY, TIME_AFTER_OP_<time_agg>,
        TERMINOLOGY, FEATURE and VALUE.
    time_agg : suffix of the time column to aggregate on (e.g. 'DAYS').

    Returns
    -------
    tuple of four DataFrames ``(median, mean, minimum, maximum)``, each with
    one row per (PT_KEY, time step).
    """
    time_col = 'TIME_AFTER_OP_%s' % time_agg
    step_keys = ['PT_KEY', time_col]

    numerical = data.loc[data['TERMINOLOGY'] == 'LOINC']
    # Group once and reuse the grouping for all four statistics.
    grouped = numerical[step_keys + ['FEATURE', 'VALUE']].groupby(step_keys + ['FEATURE'])

    def _wide_statistic(stat_name, suffix):
        # Aggregate VALUE, tag the feature name with the statistic, pivot to wide.
        stat = grouped.agg(stat_name).reset_index()
        stat['FEATURE'] = stat['FEATURE'] + suffix
        return stat.pivot_table(values='VALUE', index=step_keys, columns='FEATURE').reset_index()

    median = _wide_statistic('median', '_MEDIAN')
    mean = _wide_statistic('mean', '_MEAN')
    minimum = _wide_statistic('min', '_MIN')
    maximum = _wide_statistic('max', '_MAX')

    return median, mean, minimum, maximum

def create_agg_by_time_data(data, selected_columns, time_agg = 'DAYS'):
    """Build one row per (PT_KEY, time step) holding the selected features.

    The categorical sums and the four numeric summaries are left-joined onto
    the distinct (PT_KEY, time step) pairs found in ``data``; the result is
    then restricted to the key columns followed by ``selected_columns``.

    Parameters
    ----------
    data : long-format DataFrame (PT_KEY, TIME_AFTER_OP_<time_agg>,
        TERMINOLOGY, FEATURE, VALUE).
    selected_columns : iterable of aggregated feature column names to keep.
    time_agg : suffix of the time column to aggregate on (e.g. 'DAYS').

    Returns
    -------
    DataFrame with columns ``['PT_KEY', 'TIME_AFTER_OP_<time_agg>'] +
    selected_columns``. A selected column that does not occur in ``data``
    (for example a rare feature absent from a validation or test split) is
    returned as an all-NaN column rather than raising ``KeyError``, so the
    caller's imputation step can fill it.
    """
    key_columns = ['PT_KEY', 'TIME_AFTER_OP_%s' % time_agg]

    count = isolate_categorical_features(data, time_agg)
    median, mean, minimum, maximum = isolate_numeric_features(data, time_agg)

    # Start from every observed (patient, time step) and attach each summary.
    agg_data = data[key_columns].drop_duplicates()
    for wide_frame in (count, median, mean, minimum, maximum):
        agg_data = agg_data.merge(wide_frame, how='left', on=key_columns)

    # reindex (instead of plain column selection) tolerates selected columns
    # that never appear in this split.
    return agg_data.reindex(columns=key_columns + list(selected_columns))

# Fill NAs
def fill_NAs(data, medians):
    """Impute missing values and return a new frame.

    Columns named in ``medians`` (a Series indexed by column name) have
    their NaNs replaced by the corresponding value; every NaN that remains
    afterwards is replaced by 0. ``data`` itself is not modified.
    """
    per_column_fill = medians.to_dict()
    return data.fillna(per_column_fill).fillna(0)

# Scale data
def get_scaled_data(data, scaler, time_agg):
    """Apply an already-fitted scaler to the feature columns of ``data``.

    Parameters
    ----------
    data : DataFrame containing PT_KEY, TIME_AFTER_OP_<time_agg>, SSI and
        the feature columns the scaler was fitted on.
    scaler : object exposing ``transform`` (e.g. a fitted MinMaxScaler).
    time_agg : suffix of the time column (e.g. 'DAYS').

    Returns
    -------
    DataFrame with the three identifier columns first (unscaled) followed
    by the scaled feature columns, on the same index as ``data``.
    """
    id_columns = ['PT_KEY', 'TIME_AFTER_OP_%s' % time_agg, 'SSI']
    features = data.drop(columns=id_columns)

    # Reuse data's index: concat aligns on index, so a fresh RangeIndex would
    # misalign rows (and introduce NaNs) whenever data is not 0..n-1 indexed.
    scaled_features = pd.DataFrame(scaler.transform(features),
                                   columns=features.columns,
                                   index=data.index)
    return pd.concat([data[id_columns], scaled_features], axis=1)

# Transform to 3D array
def make_3D_array(data, time_agg, max_value):
    """Stack per-patient rows into an array of fixed-length sequences.

    Patients are taken in order of first appearance of PT_KEY and their rows
    keep the order they have in ``data``. Only PT_KEY and SSI are dropped,
    so the time column remains as a feature. Sequences longer than
    ``max_value`` keep their first ``max_value`` rows; shorter ones are
    padded at the front with rows of -1.

    Parameters
    ----------
    data : DataFrame with PT_KEY, SSI and feature columns.
    time_agg : accepted for signature compatibility; not used by this function.
    max_value : number of time steps in every output sequence.

    Returns
    -------
    numpy array; for non-empty input its shape is
    (n_patients, max_value, n_columns - 2).
    """
    sequences = []
    for patient in data['PT_KEY'].drop_duplicates().values:
        steps = data.loc[data['PT_KEY'] == patient].drop(columns=['PT_KEY', 'SSI']).values
        n_steps = steps.shape[0]
        if n_steps > max_value:
            steps = steps[:max_value]
        else:
            # Pre-pad: the missing leading time steps are filled with -1.
            steps = np.pad(steps, [(max_value - n_steps, 0), (0, 0)], 'constant', constant_values=-1)
        sequences.append(steps)
    return np.array(sequences)
    
# Get outcome data as numpy array
def get_label_array(data, max_value):
    """Return each distinct (PT_KEY, SSI) label repeated ``max_value`` times.

    Rows follow the order of first appearance of each (PT_KEY, SSI) pair in
    ``data``. For non-empty input the result has shape
    (n_pairs, max_value).
    """
    per_patient = data[['PT_KEY', 'SSI']].drop_duplicates()
    return np.array([np.repeat(ssi, max_value) for ssi in per_patient['SSI'].values])

# Train Data Prep
N = 6430

In [None]:
# Aggregate the long training records to one row per (PT_KEY, day).
# Hourly alternative: create_agg_by_time_data(train_data, keep_columns, 'HR')
trn_data = create_agg_by_time_data(train_data, keep_columns, time_agg='DAYS')

# Training medians of the columns whose name contains 'LOINC'; the same
# values are reused further down to impute the validation and test sets.
loinc_columns = [name for name in trn_data.columns if re.search('LOINC', name)]
medians = trn_data[loinc_columns].median()

# Impute, then attach the outcome columns by patient.
trn_data = fill_NAs(trn_data, medians).merge(outcomes, how='left', on='PT_KEY')

### Scale Data
Use MinMaxScaler (rescales each feature to the range 0 to 1) since the data is not necessarily normally distributed.

Save scaler from training data to apply to testing data.

In [None]:
# Fit on the training features only (identifier, time and label columns excluded).
# Hourly alternative: drop 'TIME_AFTER_OP_HR' here and pass 'HR' below.
train_features = trn_data.drop(columns=['PT_KEY','TIME_AFTER_OP_DAYS','SSI'])
scaler = MinMaxScaler().fit(train_features)

# Replace the training frame with its scaled version.
trn_data = get_scaled_data(trn_data, scaler, time_agg='DAYS')

### Split training data into 10 stratified folds

Used for cross validation in tuning. 

In [None]:
# Fixed seed so the fold assignment is identical on every Restart & Run All.
CV_RANDOM_STATE = 42

# One row per patient with its label; stratification is done at patient level.
cv_data = trn_data[['PT_KEY','SSI']].drop_duplicates().reset_index(drop=True)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=CV_RANDOM_STATE)

# fold_ids['train'][k] / fold_ids['val'][k] hold the PT_KEY values of fold k (k = 1..10).
fold_ids = {'train':{}, 'val':{}}
for fold, (train_index, val_index) in enumerate(cv.split(cv_data['PT_KEY'], cv_data['SSI']), start=1):
    fold_train_data = cv_data.iloc[train_index]
    fold_ids['train'][fold] = fold_train_data['PT_KEY'].values
    fold_val_data = cv_data.iloc[val_index]
    fold_ids['val'][fold] = fold_val_data['PT_KEY'].values

In [None]:
def _rows_for_patients(frame, patient_keys):
    """Return a copy of the rows of ``frame`` whose PT_KEY is in ``patient_keys``."""
    return frame.loc[frame['PT_KEY'].isin(patient_keys)].copy()

# Fold number -> scaled training rows belonging to that fold's patients.
trn_folds = {fold_number: _rows_for_patients(trn_data, patient_keys)
             for fold_number, patient_keys in fold_ids['train'].items()}
val_folds = {fold_number: _rows_for_patients(trn_data, patient_keys)
             for fold_number, patient_keys in fold_ids['val'].items()}

# Bind the individual names that the following cells refer to.
(trn_fold_1, trn_fold_2, trn_fold_3, trn_fold_4, trn_fold_5,
 trn_fold_6, trn_fold_7, trn_fold_8, trn_fold_9, trn_fold_10) = [
    trn_folds[fold_number] for fold_number in range(1, 11)]

(val_fold_1, val_fold_2, val_fold_3, val_fold_4, val_fold_5,
 val_fold_6, val_fold_7, val_fold_8, val_fold_9, val_fold_10) = [
    val_folds[fold_number] for fold_number in range(1, 11)]

### Transform into 3D numpy array

The final step is to coerce the data into a 3D array with one entry per operative event on the first axis, the time steps on the second axis, and the features on the third axis.

Operative events do not all have the same number of days, so shorter sequences are padded at the front with -1 up to `max_value` time steps.

In [None]:
# Longest sequence in the training set: the largest number of day rows any patient has.
# Hourly alternative: count 'TIME_AFTER_OP_HR' and pass 'HR' below.
steps_per_patient = trn_data.groupby('PT_KEY')['TIME_AFTER_OP_DAYS'].count()
max_value = steps_per_patient.max()

train_X = make_3D_array(trn_data, time_agg='DAYS', max_value=max_value)

In [None]:
# Fold frames in fold order (1..10).
trn_fold_frames = [trn_fold_1, trn_fold_2, trn_fold_3, trn_fold_4, trn_fold_5,
                   trn_fold_6, trn_fold_7, trn_fold_8, trn_fold_9, trn_fold_10]
val_fold_frames = [val_fold_1, val_fold_2, val_fold_3, val_fold_4, val_fold_5,
                   val_fold_6, val_fold_7, val_fold_8, val_fold_9, val_fold_10]

# Every fold is padded/truncated to the same max_value as the whole training set.
# Hourly alternative: pass 'HR' instead of 'DAYS' (with trn_data and max_value built for 'HR').
(train_X_1, train_X_2, train_X_3, train_X_4, train_X_5,
 train_X_6, train_X_7, train_X_8, train_X_9, train_X_10) = [
    make_3D_array(fold_frame, 'DAYS', max_value) for fold_frame in trn_fold_frames]

(val_X_1, val_X_2, val_X_3, val_X_4, val_X_5,
 val_X_6, val_X_7, val_X_8, val_X_9, val_X_10) = [
    make_3D_array(fold_frame, 'DAYS', max_value) for fold_frame in val_fold_frames]

### Label Data

Getting label data for both many-to-one and many-to-many models: each patient's SSI label is repeated once for every one of the `max_value` time steps.

In [None]:
# Labels for the whole training set, one row per patient.
train_y = get_label_array(data=trn_data, max_value=max_value)

In [None]:
# Fold frames in fold order (1..10).
trn_label_frames = [trn_fold_1, trn_fold_2, trn_fold_3, trn_fold_4, trn_fold_5,
                    trn_fold_6, trn_fold_7, trn_fold_8, trn_fold_9, trn_fold_10]
val_label_frames = [val_fold_1, val_fold_2, val_fold_3, val_fold_4, val_fold_5,
                    val_fold_6, val_fold_7, val_fold_8, val_fold_9, val_fold_10]

(train_y_1, train_y_2, train_y_3, train_y_4, train_y_5,
 train_y_6, train_y_7, train_y_8, train_y_9, train_y_10) = [
    get_label_array(fold_frame, max_value) for fold_frame in trn_label_frames]

(val_y_1, val_y_2, val_y_3, val_y_4, val_y_5,
 val_y_6, val_y_7, val_y_8, val_y_9, val_y_10) = [
    get_label_array(fold_frame, max_value) for fold_frame in val_label_frames]


### Save Train Data

In [None]:
# Shape summary of the full training arrays.
print('Whole Training Set')
print('Features: %s' % (train_X.shape,))
print('Labels: %s' % (train_y.shape,))

# Persistence is currently disabled; uncomment to write the training artefacts.
# medians.to_pickle('../../data/medians.pkl')
# with open('../../data/scaler.pkl', 'wb') as f:
#     pickle.dump(scaler, f)
# trn_data.to_pickle('../../data/train_scaled.pkl')

# with open('../../data/train_X.pkl', 'wb') as f:
#     pickle.dump(train_X, f)
# with open('../../data/train_y.pkl', 'wb') as f:
#     pickle.dump(train_y, f)

In [None]:
# (train features, train labels, validation features, validation labels) per fold, in fold order.
cv_fold_arrays = [
    (train_X_1, train_y_1, val_X_1, val_y_1),
    (train_X_2, train_y_2, val_X_2, val_y_2),
    (train_X_3, train_y_3, val_X_3, val_y_3),
    (train_X_4, train_y_4, val_X_4, val_y_4),
    (train_X_5, train_y_5, val_X_5, val_y_5),
    # Fold 6 previously reported train_y (the whole training set) instead of train_y_6.
    (train_X_6, train_y_6, val_X_6, val_y_6),
    (train_X_7, train_y_7, val_X_7, val_y_7),
    (train_X_8, train_y_8, val_X_8, val_y_8),
    (train_X_9, train_y_9, val_X_9, val_y_9),
    (train_X_10, train_y_10, val_X_10, val_y_10),
]

print('Cross Validation Folds:')
for fold_number, (fold_trn_X, fold_trn_y, fold_val_X, fold_val_y) in enumerate(cv_fold_arrays, start=1):
    print('Fold %d' % fold_number)
    print('Train Features: %s' %str(fold_trn_X.shape))
    print('Train Labels: %s' %str(fold_trn_y.shape))
    print('Validation Features: %s' %str(fold_val_X.shape))
    print('Validation Labels: %s' %str(fold_val_y.shape))


In [None]:
# with open('../../data/cv_data/train_1_X.pkl', 'wb') as f:
#     pickle.dump(train_X_1, f)
# with open('../../data/cv_data/train_1_y.pkl', 'wb') as f:
#     pickle.dump(train_y_1, f)
    
# with open('../../data/cv_data/train_2_X.pkl', 'wb') as f:
#     pickle.dump(train_X_2, f)
# with open('../../data/cv_data/train_2_y.pkl', 'wb') as f:
#     pickle.dump(train_y_2, f)
    
# with open('../../data/cv_data/train_3_X.pkl', 'wb') as f:
#     pickle.dump(train_X_3, f)
# with open('../../data/cv_data/train_3_y.pkl', 'wb') as f:
#     pickle.dump(train_y_3, f)
    
# with open('../../data/cv_data/train_4_X.pkl', 'wb') as f:
#     pickle.dump(train_X_4, f)
# with open('../../data/cv_data/train_4_y.pkl', 'wb') as f:
#     pickle.dump(train_y_4, f)
    
# with open('../../data/cv_data/train_5_X.pkl', 'wb') as f:
#     pickle.dump(train_X_5, f)
# with open('../../data/cv_data/train_5_y.pkl', 'wb') as f:
#     pickle.dump(train_y_5, f)
    
# with open('../../data/cv_data/train_6_X.pkl', 'wb') as f:
#     pickle.dump(train_X_6, f)
# with open('../../data/cv_data/train_6_y.pkl', 'wb') as f:
#     pickle.dump(train_y_6, f)
    
# with open('../../data/cv_data/train_7_X.pkl', 'wb') as f:
#     pickle.dump(train_X_7, f)
# with open('../../data/cv_data/train_7_y.pkl', 'wb') as f:
#     pickle.dump(train_y_7, f)
    
# with open('../../data/cv_data/train_8_X.pkl', 'wb') as f:
#     pickle.dump(train_X_8, f)
# with open('../../data/cv_data/train_8_y.pkl', 'wb') as f:
#     pickle.dump(train_y_8, f)
    
# with open('../../data/cv_data/train_9_X.pkl', 'wb') as f:
#     pickle.dump(train_X_9, f)
# with open('../../data/cv_data/train_9_y.pkl', 'wb') as f:
#     pickle.dump(train_y_9, f)
    
# with open('../../data/cv_data/train_10_X.pkl', 'wb') as f:
#     pickle.dump(train_X_10, f)
# with open('../../data/cv_data/train_10_y.pkl', 'wb') as f:
#     pickle.dump(train_y_10, f)


In [None]:
# with open('../../data/cv_data/val_1_X.pkl', 'wb') as f:
#     pickle.dump(val_X_1, f)
# with open('../../data/cv_data/val_1_y.pkl', 'wb') as f:
#     pickle.dump(val_y_1, f)
    
# with open('../../data/cv_data/val_2_X.pkl', 'wb') as f:
#     pickle.dump(val_X_2, f)
# with open('../../data/cv_data/val_2_y.pkl', 'wb') as f:
#     pickle.dump(val_y_2, f)
    
# with open('../../data/cv_data/val_3_X.pkl', 'wb') as f:
#     pickle.dump(val_X_3, f)
# with open('../../data/cv_data/val_3_y.pkl', 'wb') as f:
#     pickle.dump(val_y_3, f)
    
# with open('../../data/cv_data/val_4_X.pkl', 'wb') as f:
#     pickle.dump(val_X_4, f)
# with open('../../data/cv_data/val_4_y.pkl', 'wb') as f:
#     pickle.dump(val_y_4, f)
    
# with open('../../data/cv_data/val_5_X.pkl', 'wb') as f:
#     pickle.dump(val_X_5, f)
# with open('../../data/cv_data/val_5_y.pkl', 'wb') as f:
#     pickle.dump(val_y_5, f)
    
# with open('../../data/cv_data/val_6_X.pkl', 'wb') as f:
#     pickle.dump(val_X_6, f)
# with open('../../data/cv_data/val_6_y.pkl', 'wb') as f:
#     pickle.dump(val_y_6, f)
    
# with open('../../data/cv_data/val_7_X.pkl', 'wb') as f:
#     pickle.dump(val_X_7, f)
# with open('../../data/cv_data/val_7_y.pkl', 'wb') as f:
#     pickle.dump(val_y_7, f)
    
# with open('../../data/cv_data/val_8_X.pkl', 'wb') as f:
#     pickle.dump(val_X_8, f)
# with open('../../data/cv_data/val_8_y.pkl', 'wb') as f:
#     pickle.dump(val_y_8, f)
    
# with open('../../data/cv_data/val_9_X.pkl', 'wb') as f:
#     pickle.dump(val_X_9, f)
# with open('../../data/cv_data/val_9_y.pkl', 'wb') as f:
#     pickle.dump(val_y_9, f)
    
# with open('../../data/cv_data/val_10_X.pkl', 'wb') as f:
#     pickle.dump(val_X_10, f)
# with open('../../data/cv_data/val_10_y.pkl', 'wb') as f:
#     pickle.dump(val_y_10, f)

# Validation Data Prep 
N = 918

Use medians, scaler, and max_value from train

In [None]:
# Same pipeline as training, but imputing with the medians computed on the training set.
# Hourly alternative: create_agg_by_time_data(validation_data, keep_columns, 'HR')
val_aggregated = create_agg_by_time_data(validation_data, keep_columns, time_agg='DAYS')
val_data = fill_NAs(val_aggregated, medians).merge(outcomes, how='left', on='PT_KEY')

### Scale Data

In [None]:
# Scale with the scaler fitted on the training data (pass 'HR' for hourly steps).
val_data = get_scaled_data(data=val_data, scaler=scaler, time_agg='DAYS')

### Transform into 3D numpy array

In [None]:
# Pad/truncate to the training max_value (pass 'HR' for hourly steps).
val_X = make_3D_array(data=val_data, time_agg='DAYS', max_value=max_value)

### Label Data

In [None]:
# Validation labels, one row per patient.
val_y = get_label_array(data=val_data, max_value=max_value)

### Save Data

In [None]:
# Shape summary of the validation arrays.
print('Validation Set')
print('Features: %s' %str(val_X.shape))
print('Labels: %s' %str(val_y.shape))

# Persist the scaled frame and the model-ready arrays.
val_data.to_pickle('../../data/val_scaled.pkl')

with open('../../data/val_X.pkl', 'wb') as f:
    pickle.dump(val_X, f)
with open('../../data/val_y.pkl', 'wb') as f:
    # Dump val_y: the previously referenced `val_y_m2o` is never defined in
    # this notebook and raised NameError on a fresh kernel.
    pickle.dump(val_y, f)

# Test Data Prep 
N = 1837

Use medians, scaler, and max_value from train

In [None]:
# Same pipeline as training, but imputing with the medians computed on the training set.
# Hourly alternative: create_agg_by_time_data(test_data, keep_columns, 'HR')
tst_aggregated = create_agg_by_time_data(test_data, keep_columns, time_agg='DAYS')
tst_data = fill_NAs(tst_aggregated, medians).merge(outcomes, how='left', on='PT_KEY')

### Scale Data

In [None]:
# Scale with the scaler fitted on the training data (pass 'HR' for hourly steps).
tst_data = get_scaled_data(data=tst_data, scaler=scaler, time_agg='DAYS')

### Transform into 3D numpy array

In [None]:
# Pad/truncate to the training max_value (pass 'HR' for hourly steps).
test_X = make_3D_array(data=tst_data, time_agg='DAYS', max_value=max_value)

### Label Data

In [None]:
# Test labels, one row per patient.
test_y = get_label_array(data=tst_data, max_value=max_value)

### Id Data

In [None]:
# Distinct patient keys in order of first appearance (the same order make_3D_array uses).
id_data = tst_data.drop_duplicates(subset='PT_KEY')['PT_KEY'].values

### Save Test Data

In [None]:
# Shape summary of the test arrays.
print('Testing Set')
print('Features: %s' % (test_X.shape,))
print('Labels: %s' % (test_y.shape,))

# Persistence is currently disabled; uncomment to write the test artefacts.
# tst_data.to_pickle('../../data/test_scaled.pkl')

# with open('../../data/test_X.pkl', 'wb') as f:
#     pickle.dump(test_X, f)
# with open('../../data/test_y.pkl', 'wb') as f:
#     pickle.dump(test_y, f)
# with open('../../data/test_ids.pkl', 'wb') as f:
#     pickle.dump(id_data, f)