In [None]:
import gc
import pickle
import timeit
from pathlib import Path

import pandas as pd

In [None]:
"""
The dataset spans TRAIN_START_DATE = '2017-01-01' to TRAIN_END_DATE = '2020-03-10'.
We split it into a training set, a validation set & a test set.
"""

# Last censusdate (inclusive) that is assigned to the training split.
train_end_date = '2020-09-01'
# Last censusdate (inclusive) that is assigned to the validation split;
# rows with a later censusdate go to the test split.
valid_end_date = '2020-11-29'

In [None]:
# Directory that holds the processed input and every artifact written by this notebook.
processed_path = Path('/data/processed')
# Create the directory (and any missing parents); no error if it already exists.
processed_path.mkdir(parents=True, exist_ok=True)
# Name of the parquet file (inside processed_path) that this notebook loads.
filename = '05-result.parquet'

In [None]:
# Load the parquet file named by `filename` from the processed-data directory.
final = pd.read_parquet(processed_path / filename)

In [None]:
# column processing

# Client identifier = the text before the first '_' in masterpatientid.
# Uses the vectorized .str accessor instead of a row-wise Python lambda.
final['client'] = final['masterpatientid'].str.split('_', n=1).str[0]

# Qualify facilityid with the client prefix: "<client>_<facilityid>".
# NOTE: this statement is not idempotent - re-running the cell on the same
# frame prefixes facilityid a second time, so reload `final` before re-running.
final["facilityid"] = final["client"] + "_" + final["facilityid"].astype(str)

In [None]:
# Fill missing values in the two target columns with False up front,
# so we don't also get na indicators for them.
for target_col in ('hosp_target_3_day_hosp', 'hosp_target_7_day_hosp'):
    final[target_col] = final[target_col].fillna(False)

In [None]:
# manual check to make sure we're not including any columns that could leak data:
# dump every column name, one per line, next to the other processed artifacts.
# The location is derived from processed_path instead of repeating the directory literal.
with open(processed_path / 'columns.txt', 'w') as f:
    # f-string formatting also tolerates non-string column labels.
    f.writelines(f'{col}\n' for col in final.columns)

In [None]:
# Chronological split on censusdate:
#   train: censusdate <= train_end_date
#   valid: train_end_date < censusdate <= valid_end_date
#   test : censusdate > valid_end_date
in_train = final.censusdate <= train_end_date
in_valid = (final.censusdate > train_end_date) & (final.censusdate <= valid_end_date)
in_test = final.censusdate > valid_end_date

train = final.loc[in_train]
valid = final.loc[in_valid]
test = final.loc[in_test]

# One shape per line, in train / valid / test order.
print(train.shape, valid.shape, test.shape, sep='\n')

# The full frame is no longer needed; drop the name and collect garbage.
del final
gc.collect()

In [None]:
# start of basic tests - assert we have disjoint sets over time
train_last_day, valid_first_day = train.censusdate.max(), valid.censusdate.min()
valid_last_day, test_first_day = valid.censusdate.max(), test.censusdate.min()
assert train_last_day < valid_first_day
assert valid_last_day < test_first_day

# The mean of the 3-day target must be below the mean of the 7-day target
# in both the train and the valid split.
for rate_3_day, rate_7_day in (
    (train.hosp_target_3_day_hosp.mean(), train.hosp_target_7_day_hosp.mean()),
    (valid.hosp_target_3_day_hosp.mean(), valid.hosp_target_7_day_hosp.mean()),
):
    assert rate_3_day < rate_7_day
# The same check on the test split is left disabled, as in the original notebook:
# assert test.hosp_target_3_day_hosp.mean() < test.hosp_target_7_day_hosp.mean()
print('Success...')

In [None]:
# One summary line per split: censusdate coverage plus the mean of each target column.
for split_name, split_df in (('Train', train), ('Valid', valid), ('Test', test)):
    first_day = split_df.censusdate.min()
    last_day = split_df.censusdate.max()
    rate_3_day = split_df.hosp_target_3_day_hosp.mean()
    rate_7_day = split_df.hosp_target_7_day_hosp.mean()
    print(f'{split_name} set covers {first_day} to {last_day} with 3_day_hosp percentage {rate_3_day} and 7_day_hosp percentage {rate_7_day}')
# Do not keep an extra reference to the last split alive after the loop.
del split_df


In [None]:
def fill_na_train(df):
    """Fill NaNs in ``df`` with per-column medians computed from ``df`` itself.

    Medians are computed for every column of ``df`` that contains at least one
    NaN, and for every column whose name starts with 'vtl' (even when it has no
    NaN in ``df``). The column selection is based on the ``df`` argument, not on
    any notebook-level variable.

    Args:
        df: DataFrame to impute (the training split).

    Returns:
        Tuple ``(filled, medians)``:
          * ``filled``  - a new DataFrame with NaNs replaced by the medians
            (``df`` itself is not modified);
          * ``medians`` - Series indexed by column name holding the fill values,
            intended to be reused on other splits and at prediction time.
    """
    nan_cols = [col for col in df.columns if df[col].isnull().any() or col.startswith('vtl')]

    medians = df.loc[:, nan_cols].median()
    filled = df.fillna(medians)

    return filled, medians

def fill_na_valid_or_test(df, na_filler):
    """Return a copy of ``df`` whose NaNs are replaced using ``na_filler``.

    ``na_filler`` is passed straight to ``DataFrame.fillna`` as the fill value;
    ``df`` itself is left untouched.
    """
    filled = df.fillna(value=na_filler)
    return filled

In [None]:
%%time

# Fill in any remaining NaNs. Because past information is no longer forward-filled,
# a global imputation would not be correct: the fill values are computed on the
# train split only and then applied unchanged to the valid and test splits.
# The fill values (na_filler) are kept so they can be reused during predictions.

# train is imputed first; this call also produces na_filler.
train, na_filler = fill_na_train(train)
# valid and test reuse the train-derived fill values.
valid = fill_na_valid_or_test(valid, na_filler)
test = fill_na_valid_or_test(test, na_filler)

# The pre-imputation frames were rebound above; collect them now.
gc.collect()

In [None]:
# Remove the Target values & identification columns
def prep(df):
    """Split ``df`` into model features, the two targets and identifier columns.

    Returns:
        Tuple ``(x, target_3_day, target_7_day, idens)``:
          * ``x`` - ``df`` without the identifier columns and without any column
            whose name contains 'target', cast to float32, with a fresh 0..n-1 index;
          * ``target_3_day`` / ``target_7_day`` - float32 arrays taken from
            ``hosp_target_3_day_hosp`` / ``hosp_target_7_day_hosp``;
          * ``idens`` - the identifier columns, keeping the original index of ``df``
            (unlike ``x``, it is not re-indexed).
    """
    target_cols = [name for name in df.columns if 'target' in name]
    non_feature_cols = ['censusdate', 'masterpatientid', 'facilityid', 'bedid', 'client'] + target_cols

    labels_3_day = df['hosp_target_3_day_hosp'].astype('float32').values
    labels_7_day = df['hosp_target_7_day_hosp'].astype('float32').values

    features = df.drop(columns=non_feature_cols)
    features = features.reset_index(drop=True)
    features = features.astype('float32')

    iden_cols = ['masterpatientid', 'censusdate', 'facilityid', 'bedid', 'client']
    idens = df.loc[:, iden_cols]

    return features, labels_3_day, labels_7_day, idens

In [None]:
%%time

# Separate targets, feature frame (x) and identification columns for each split.
# Each source frame is deleted right after it has been processed so that only
# the derived pieces stay referenced.
train_x, train_target_3_day, train_target_7_day, train_idens = prep(train)
del train
valid_x, valid_target_3_day, valid_target_7_day, valid_idens = prep(valid)
del valid
test_x, test_target_3_day, test_target_7_day, test_idens = prep(test)
del test

# Collect the frames released by the del statements above.
gc.collect()

In [None]:
# make sure that, within each split, x, both targets and idens all have the same # of rows
assert len(train_x) == len(train_target_3_day) == len(train_target_7_day) == len(train_idens)
assert len(valid_x) == len(valid_target_3_day) == len(valid_target_7_day) == len(valid_idens)
assert len(test_x) == len(test_target_3_day) == len(test_target_7_day) == len(test_idens)

# make sure that train, valid, and test have the same # of feature columns
assert len(train_x.columns) == len(valid_x.columns) == len(test_x.columns)

# make sure that the idens all have the same # of columns
assert len(train_idens.columns) == len(valid_idens.columns) == len(test_idens.columns)

In [None]:
%%time

# Save the train, valid and test datasets (plus the NA fill values) in the local
# processed folder. `pickle` is imported in the notebook's top import cell.

# Artifact name -> object. Each entry is written to
# processed_path / 'final-<name>.pickle', in the order listed here.
artifacts = {
    'train_x': train_x,
    'train_target_3_day': train_target_3_day,
    'train_target_7_day': train_target_7_day,
    'train_idens': train_idens,
    'valid_x': valid_x,
    'valid_target_3_day': valid_target_3_day,
    'valid_target_7_day': valid_target_7_day,
    'valid_idens': valid_idens,
    'test_x': test_x,
    'test_target_3_day': test_target_3_day,
    'test_target_7_day': test_target_7_day,
    'test_idens': test_idens,
    'na_filler': na_filler,
}

for artifact_name, artifact in artifacts.items():
    with open(processed_path / f'final-{artifact_name}.pickle', 'wb') as f:
        # protocol=4 is kept from the original cell.
        pickle.dump(artifact, f, protocol=4)

print("--------------Completed--------------")