In [1]:
import pandas as pd
import pickle
import re
import numpy as np
from sklearn.model_selection import train_test_split

# Data Preprocessing
## Load data

In [2]:
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
BASELINE_PATH = '../data/baseline_data.pkl'
GROUPED_PATH = '../data/grouped_data.pkl'

baseline_data = pd.read_pickle(BASELINE_PATH)
grouped_data = pd.read_pickle(GROUPED_PATH)

In [3]:
# Columns removed from the feature matrices; only 'SSI' is used as the target here.
outcome_cols = ['SSI','PNEUMO','UTI','SEPSIS']

X_baseline, y_baseline_SSI = baseline_data.drop(columns=outcome_cols), baseline_data['SSI']
X_grouped, y_grouped_SSI = grouped_data.drop(columns=outcome_cols), grouped_data['SSI']

## Split into Train and Test

In [4]:
# Fraction of rows held out in the test partition.
split_size = 0.3

# Stratify on the SSI target for both datasets.
X_base_trn, X_base_tst, y_base_trn, y_base_tst = train_test_split(
    X_baseline, y_baseline_SSI,
    test_size=split_size, stratify=y_baseline_SSI, random_state=1,
)

# Reset features AND labels together. Previously only the feature frames were
# reset, leaving the labels on the original shuffled index, so X and y no
# longer shared an index and index-aligned operations between them were unsafe.
X_base_trn = X_base_trn.reset_index(drop=True)
X_base_tst = X_base_tst.reset_index(drop=True)
y_base_trn = y_base_trn.reset_index(drop=True)
y_base_tst = y_base_tst.reset_index(drop=True)


X_group_trn, X_group_tst, y_group_trn, y_group_tst = train_test_split(
    X_grouped, y_grouped_SSI,
    test_size=split_size, stratify=y_grouped_SSI, random_state=2,
)

X_group_trn = X_group_trn.reset_index(drop=True)
X_group_tst = X_group_tst.reset_index(drop=True)
y_group_trn = y_group_trn.reset_index(drop=True)
y_group_tst = y_group_tst.reset_index(drop=True)

## Outliers
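Identify outliers in the lab columns and cap them at the most extreme non-outlier value.

A lab value is treated as an outlier when it lies more than 6 standard deviations above or below the training-set mean. Each outlier is replaced with the largest (or smallest) training value that is still within 6 standard deviations of the mean. The same training-derived thresholds and replacement values are applied to the test set.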

In [5]:
def create_std_data(lab_data):
    """Summarise each column of ``lab_data`` with its mean, std and 6-std bounds.

    Returns a DataFrame indexed by the column names of ``lab_data`` with the
    columns 'mean', 'std', 'std+6' (mean + 6*std) and 'std-6' (mean - 6*std).
    """
    stats = pd.concat(
        [lab_data.mean().rename('mean'), lab_data.std().rename('std')],
        axis=1,
    )
    six_std = stats['std'] * 6
    stats['std+6'] = stats['mean'] + six_std
    stats['std-6'] = stats['mean'] - six_std
    return stats

In [6]:
# Lab columns are those whose name contains 'LAB'.
lab_regex = re.compile(r'LAB')

lab_base_cols = list(filter(lab_regex.search, X_base_trn.columns))
lab_base_trn, lab_base_tst = X_base_trn[lab_base_cols], X_base_tst[lab_base_cols]

lab_group_cols = list(filter(lab_regex.search, X_group_trn.columns))
lab_group_trn, lab_group_tst = X_group_trn[lab_group_cols], X_group_tst[lab_group_cols]

In [7]:
# Baseline outliers
# Thresholds (mean +/- 6 std) and replacement values are derived from the
# training labs only, then applied to both the training and the test frame.
agg_data = create_std_data(lab_base_trn)

for column_name in agg_data.index:
    upper_limit = agg_data.at[column_name, 'std+6']
    lower_limit = agg_data.at[column_name, 'std-6']

    # Masks are computed on the untouched lab snapshots, so the max pass
    # cannot influence the min pass.
    trn_values = lab_base_trn[column_name]
    tst_values = lab_base_tst[column_name]

    # Max outliers: replace with the largest training value within the limit.
    trn_high = trn_values > upper_limit
    tst_high = tst_values > upper_limit
    max_value = trn_values[~trn_high].max()

    # Only assign when something matched, so columns without outliers are
    # left completely untouched (as in a per-row loop over an empty list).
    if trn_high.any():
        X_base_trn.loc[trn_high, column_name] = max_value
    if tst_high.any():
        X_base_tst.loc[tst_high, column_name] = max_value

    # Min outliers: replace with the smallest training value within the limit.
    trn_low = trn_values < lower_limit
    tst_low = tst_values < lower_limit
    min_value = trn_values[~trn_low].min()

    if trn_low.any():
        X_base_trn.loc[trn_low, column_name] = min_value
    if tst_low.any():
        X_base_tst.loc[tst_low, column_name] = min_value
    

In [8]:
# Grouped outliers
# Thresholds (mean +/- 6 std) and replacement values are derived from the
# training labs only, then applied to both the training and the test frame.
agg_data = create_std_data(lab_group_trn)

for column_name in agg_data.index:
    upper_limit = agg_data.at[column_name, 'std+6']
    lower_limit = agg_data.at[column_name, 'std-6']

    # Masks are computed on the untouched lab snapshots, so the max pass
    # cannot influence the min pass.
    trn_values = lab_group_trn[column_name]
    tst_values = lab_group_tst[column_name]

    # Max outliers: replace with the largest training value within the limit.
    trn_high = trn_values > upper_limit
    tst_high = tst_values > upper_limit
    max_value = trn_values[~trn_high].max()

    # Only assign when something matched, so columns without outliers are
    # left completely untouched (as in a per-row loop over an empty list).
    if trn_high.any():
        X_group_trn.loc[trn_high, column_name] = max_value
    if tst_high.any():
        X_group_tst.loc[tst_high, column_name] = max_value

    # Min outliers: replace with the smallest training value within the limit.
    trn_low = trn_values < lower_limit
    tst_low = tst_values < lower_limit
    min_value = trn_values[~trn_low].min()

    if trn_low.any():
        X_group_trn.loc[trn_low, column_name] = min_value
    if tst_low.any():
        X_group_tst.loc[tst_low, column_name] = min_value
    

## Imputation
Compute the median of each lab column on the training set, save those medians, and use them to fill missing lab values in both the training and test sets.

In [9]:
def get_lab_medians(X_train):
    """Return a dict mapping each 'LAB_' column of ``X_train`` to its median."""
    lab_pattern = re.compile(r'LAB_')
    selected = [name for name in X_train.columns if lab_pattern.search(name)]
    medians = X_train[selected].median(axis=0)
    return dict(medians)

def save_lab_medians(lab_medians, fname):
    """Pickle ``lab_medians`` to '../data/medians/<fname>_lab_medians.pkl'."""
    target = '../data/medians/%s_lab_medians.pkl' % fname
    with open(target, 'wb') as handle:
        pickle.dump(lab_medians, handle)

In [10]:
# Medians are taken from the training frames only.
SSI_base_medians = get_lab_medians(X_base_trn)
SSI_group_medians = get_lab_medians(X_group_trn)

# Persist the medians that are used for imputation below.
save_lab_medians(SSI_base_medians, 'SSI_baseline')
save_lab_medians(SSI_group_medians, 'SSI_grouped')

# Fill missing labs in train and test with the training medians.
X_base_trn, X_base_tst = [
    frame.fillna(SSI_base_medians) for frame in (X_base_trn, X_base_tst)
]
X_group_trn, X_group_tst = [
    frame.fillna(SSI_group_medians) for frame in (X_group_trn, X_group_tst)
]

## Save dataframes
Save dataframes as pickle files.

In [11]:
# Destination path -> object to pickle, written in insertion order.
split_outputs = {
    '../data/split_sets/X_baseline_train_SSI.pkl': X_base_trn,
    '../data/split_sets/X_baseline_test_SSI.pkl': X_base_tst,
    '../data/split_sets/y_baseline_train_SSI.pkl': y_base_trn,
    '../data/split_sets/y_baseline_test_SSI.pkl': y_base_tst,
    '../data/split_sets/X_grouped_train_SSI.pkl': X_group_trn,
    '../data/split_sets/X_grouped_test_SSI.pkl': X_group_tst,
    '../data/split_sets/y_grouped_train_SSI.pkl': y_group_trn,
    '../data/split_sets/y_grouped_test_SSI.pkl': y_group_tst,
}

for out_path, split_obj in split_outputs.items():
    split_obj.to_pickle(out_path)