In [None]:
import pandas as pd
import re
import numpy as np
import pickle
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Input locations, relative to this notebook's directory.
TRAIN_LONG_PATH = '../../data/train_long_data.pkl'
SSI_OUTCOMES_PATH = '../../data/SSI_outcomes.pkl'

# NOTE: unpickling executes arbitrary code -- only load pickles from a trusted source.
# train_data: long-format table (the cells below read PT_KEY, TERMINOLOGY, FEATURE, VALUE).
train_data = pd.read_pickle(TRAIN_LONG_PATH)
# outcomes: joined onto the features by PT_KEY further down; presumably supplies the SSI label.
outcomes = pd.read_pickle(SSI_OUTCOMES_PATH)

# Data Prep for Feature Selection

Build a wide, aggregated dataframe (one row per `PT_KEY`, one column per aggregated feature) to use for feature selection.

In [None]:
def _wide_patient_stat(long_df, stat, suffix):
    """Reduce a long table to one statistic per patient/feature and pivot it wide.

    Parameters
    ----------
    long_df : pandas.DataFrame
        Long-format rows containing at least the columns ``PT_KEY``,
        ``FEATURE`` and ``VALUE``.
    stat : str
        Name of the groupby reduction method to call
        (e.g. ``'median'``, ``'mean'``, ``'min'``, ``'max'``).
    suffix : str
        Text appended to every feature name so the resulting columns stay
        distinguishable once several statistics are merged together.

    Returns
    -------
    pandas.DataFrame
        ``PT_KEY`` as a regular column plus one ``<FEATURE><suffix>`` column
        per feature.
    """
    grouped = long_df[['PT_KEY', 'FEATURE', 'VALUE']].groupby(['PT_KEY', 'FEATURE'])
    summary = getattr(grouped, stat)().reset_index()
    summary['FEATURE'] = summary['FEATURE'] + suffix
    # Every (PT_KEY, FEATURE) pair appears exactly once in `summary`, so
    # pivot_table's default mean aggregation leaves the values unchanged.
    return summary.pivot_table(values='VALUE', index='PT_KEY', columns='FEATURE').reset_index()


# Rows coded with LOINC terminology are treated as the numerical measurements.
numerical = train_data.loc[train_data['TERMINOLOGY'] == 'LOINC'].copy()

median = _wide_patient_stat(numerical, 'median', '_MEDIAN')
mean = _wide_patient_stat(numerical, 'mean', '_MEAN')
minimum = _wide_patient_stat(numerical, 'min', '_MIN')
maximum = _wide_patient_stat(numerical, 'max', '_MAX')

In [None]:
# Impute with medians
# Column-wise median of every aggregated numerical feature; used further down
# to fill the gaps for patients who lack a given measurement.
# PT_KEY is dropped first: it is an identifier, not a feature. If it is numeric
# it would otherwise contribute a meaningless "median patient key" from each of
# the four frames, leaving the Series with duplicate 'PT_KEY' index labels.
train_medians = pd.concat([
    wide_stat.drop(columns='PT_KEY').median(numeric_only=True)
    for wide_stat in (median, mean, minimum, maximum)
])

# train_medians.to_pickle('../../data/Time_Series_Dataset/wide_agg_medians.pkl') # These are only aggregated medians.

In [None]:
# Every row that is not LOINC-coded is handled as a categorical feature.
categorical = train_data.loc[train_data['TERMINOLOGY'].ne('LOINC')].copy()

# Sum VALUE per patient and feature, then spread the features into columns.
# The (PT_KEY, FEATURE) pairs are unique after the groupby, so pivot_table's
# default mean aggregation does not alter the sums.
count = (
    categorical[['PT_KEY', 'FEATURE', 'VALUE']]
    .groupby(['PT_KEY', 'FEATURE'])
    .sum()
    .reset_index()
    .pivot_table(values='VALUE', index='PT_KEY', columns='FEATURE')
    .reset_index()
)

In [None]:
# Start from every distinct patient in the training data and left-join each
# wide block, so patients without any feature in a block are still kept.
agg_train = train_data[['PT_KEY']].drop_duplicates()
for wide_block in (count, median, mean, minimum, maximum):
    agg_train = agg_train.merge(wide_block, how='left', on='PT_KEY')

# Columns that have an entry in train_medians are filled with that median;
# anything still missing afterwards is set to 0.
agg_train = agg_train.fillna(train_medians.to_dict()).fillna(0)

# Inner join: only patients that also have a row in `outcomes` remain.
agg_train = agg_train.merge(outcomes, how='inner', on='PT_KEY')

### Scale Data
Use `MinMaxScaler` (rescales each feature to the range 0 to 1), since the data are not necessarily normally distributed.

In [None]:
# Only the feature columns are scaled; the patient key and the SSI column are
# set aside and re-attached unchanged afterwards.
id_and_label = ['PT_KEY', 'SSI']
feature_frame = agg_train.drop(columns=id_and_label)

scaler = MinMaxScaler()
scaler.fit(feature_frame)

# transform() returns a plain array, so the column names are restored by hand.
scaled_agg_train = pd.DataFrame(
    scaler.transform(feature_frame),
    columns=list(feature_frame.columns),
)
# Side-by-side concat aligns on the row index of both frames.
scaled_agg_train = pd.concat([agg_train[id_and_label], scaled_agg_train], axis=1)

In [None]:
# scaled_agg_train.to_pickle('../../data/train_wide_agg_scaled.pkl')