In [None]:
import gc
import sys
import numpy as np
from pathlib import Path
from datetime import timedelta, datetime
import re

import pandas as pd

sys.path.insert(0, '/src')
from shared.constants import CLIENT, HYPER_PARAMETER_TUNING
from shared.utils import get_client_class, url_encode_cols


 ## =========== Set HYPER_PARAMETER_TUNING in constants.py ===========

In [None]:
# Look up the client-specific class and read its experiment date ranges.
clientClass = get_client_class(client=CLIENT)
EXPERIMENT_DATES = clientClass().get_experiment_dates()

# Move the training start forward by 30 days so that the 2/7/14/30-day
# cumulative-sum windows are fully populated from the first training day.
# NOTE(review): none of the visible cells reads 'train_start_date' after this
# point - confirm where the shifted value is meant to be applied.
shifted_train_start = pd.to_datetime(EXPERIMENT_DATES['train_start_date']) + pd.DateOffset(days=30)
EXPERIMENT_DATES['train_start_date'] = str(shifted_train_start.date())

# Columns that are split off from the feature matrix and kept as row identifiers.
IDEN_COLS = [
    'censusdate', 'facilityid', 'masterpatientid', 'bedid',
    'censusactioncode', 'payername', 'payercode', 'to_from_type',
    'client', 'rth', 'hosp_lengthofstay', 'LFS',
]

if not HYPER_PARAMETER_TUNING:
    # Outside of tuning, validation covers the last two days before
    # 'validation_end_date' (inclusive) and training ends the day before that.
    validation_end = datetime.strptime(EXPERIMENT_DATES['validation_end_date'], '%Y-%m-%d')
    EXPERIMENT_DATES['train_end_date'] = (validation_end - timedelta(days=2)).strftime('%Y-%m-%d')
    EXPERIMENT_DATES['validation_start_date'] = (validation_end - timedelta(days=1)).strftime('%Y-%m-%d')

print(CLIENT)
print(HYPER_PARAMETER_TUNING)
print(EXPERIMENT_DATES)

In [None]:
# Name of the cleaned dataset and the directory it lives in; the directory
# (including any missing parents) is created when absent.
filename = 'final_cleaned_df.parquet'
processed_path = Path('/data/processed')
processed_path.mkdir(exist_ok=True, parents=True)

In [None]:
# Load the cleaned dataset (an earlier revision read '02-result.parquet' instead).
final = pd.read_parquet(processed_path / filename)

In [None]:
# Pass the frame through the shared `url_encode_cols` helper.
# NOTE(review): the helper is imported from shared.utils and not visible here -
# presumably it URL-encodes the column names; confirm against its definition.
final = url_encode_cols(final)


In [None]:
# Keep only the facilities that receive the report (currently disabled)

# print(final.shape)
# final = final.query('facilityid in [5, 7, 10, 21, 9, 1, 6, 8, 3, 13, 4]')
# print(final.shape)

In [None]:
# Shapes of the row subsets that carry an admission date, a transfer date,
# or a positive 3-day hospitalisation target.
print('Total Admissions', final[final['dateofadmission'].notna()].shape)
print('Total RTHs', final[final['date_of_transfer'].notna()].shape)
print('Total Patient Lines', final[final['hosp_target_3_day_hosp'] == 1].shape)

In [None]:
# Remove every row whose admission date equals its transfer (RTH) date.

same_day_rows = final.index[final['dateofadmission'] == final['date_of_transfer']]
final = final.drop(index=same_day_rows)
final.shape

In [None]:
# Remove patients that have exactly one census row.

# Index by patient id so that rows can be dropped by patient.
final = final.set_index(['masterpatientid'])
single_row_patients = (
    final
    .groupby('masterpatientid')
    .filter(lambda patient_rows: len(patient_rows) == 1)
    .index
)

# Separate rebinding steps keep at most two copies of the frame alive at once.
final = final.drop(single_row_patients)
final = final.reset_index()


In [None]:
# For each patient, compute the number of days between a census row and the
# patient's previous census row (NaN on the patient's first row).

final = final.sort_values(by=['masterpatientid', 'censusdate'])
final['census_diff'] = (
    final
    .groupby('masterpatientid')['censusdate']
    .diff()
    .dt.days
)

In [None]:
# Remove positive 3-day-target rows that follow a census gap of more than one day.

# Rows are dropped by their (patient, census date) key.
final = final.set_index(['masterpatientid', 'censusdate'])
gap_rth_keys = final.query('(hosp_target_3_day_hosp == 1) & (census_diff > 1)').index
final = final.drop(gap_rth_keys)
final = final.reset_index()

| New Date   | New +ve | old Date   | Actual RTH |    +ve         |
| :---       |  :----: |   ---:     |    ---:    |       ---:     |
| 13-11-2021 | T       | 12-11-2021 |            | T              |
| 14-11-2021 | T       | 13-11-2021 |            | T              |
| 15-11-2021 | T       | 14-11-2021 |            | T              |
| 16-11-2021 | T       | 15-11-2021 |            | T              |
| 17-11-2021 | None    | 16-11-2021 |     T      | T              |

In [None]:
""" We increment the census date by 1, since the prediction day always includes data upto last night.
This means for every census date the data is upto previous night. 
"""
print(final.shape)

# Shift every census date forward by one day: a prediction made on day D only
# has data up to the night before.
final['censusdate'] = pd.to_datetime(final['censusdate']) + timedelta(days=1)

# Build a (patient, transfer date) lookup of the RTH days.
# - `.assign` works on a fresh frame, so the indicator is not written into a
#   slice of `final` (the original `rth_df['rth'] = 1` triggered
#   SettingWithCopyWarning).
# - The lookup is de-duplicated because a left merge repeats a census row once
#   per matching right-hand row; a repeated key would silently inflate `final`.
is_rth_day = final['date_of_transfer'].notna()
rth_df = (
    final.loc[is_rth_day, ['masterpatientid', 'date_of_transfer']]
    .drop_duplicates()
    .assign(rth=1)
)

# Drop the RTH-day rows themselves so that the row of the preceding day
# (whose shifted census date now equals the transfer date) carries the flag.
# Two separate steps keep at most two copies of the frame alive at once.
final = final.loc[~is_rth_day]
final = final.drop(columns=['date_of_transfer'])

# Attach the RTH indicator: rows whose (shifted) census date matches a
# transfer date of the same patient get rth == 1.
final = final.merge(
    rth_df,
    how='left',
    left_on=['masterpatientid', 'censusdate'],
    right_on=['masterpatientid', 'date_of_transfer'],
)

# Rows without a matching transfer date are non-RTH rows.
final['rth'] = final['rth'].fillna(0)

print(final.shape)

In [None]:
# Copy 'admissions_days_since_last_admission' into a new 'LFS' column. 'LFS' is
# listed in IDEN_COLS, so it is carried with the identifier columns rather than
# the features. NOTE(review): the meaning of the abbreviation is not stated in
# this notebook - confirm.
final['LFS'] = final['admissions_days_since_last_admission']

In [None]:
# Remove the helper columns that are no longer needed after RTH marking.

final = final.drop(columns=['date_of_transfer', 'dateofadmission', 'census_diff'])

In [None]:
# Derive the client name (the part of masterpatientid before the first
# underscore) and prefix facilityid with it.

final['client'] = final['masterpatientid'].map(lambda patient_id: patient_id.partition('_')[0])
final['facilityid'] = final['client'] + '_' + final['facilityid'].astype(str)

In [None]:
# Fill missing targets with False up front so that the later median imputation
# does not treat the target columns as columns containing NaN.
for target_col in ('hosp_target_3_day_hosp', 'hosp_target_7_day_hosp'):
    final[target_col] = final[target_col].fillna(False)

In [None]:
# Write one column name per line so the list can be inspected by hand for
# columns that could leak target information.
with open('/data/processed/columns.txt', 'w') as f:
    f.writelines(col + '\n' for col in final.columns)

In [None]:
# Split by (shifted) census date.
# NOTE(review): train has no lower bound ('train_start_date' is not applied
# here) and test has no upper bound; rows between 'validation_end_date' and
# 'test_start_date' fall into no split - confirm this is intended.
validation_start = EXPERIMENT_DATES['validation_start_date']
validation_end = EXPERIMENT_DATES['validation_end_date']
test_start = EXPERIMENT_DATES['test_start_date']

train = final.loc[final['censusdate'] < validation_start]
valid = final.loc[final['censusdate'].between(validation_start, validation_end)]
test = final.loc[final['censusdate'] >= test_start]

# One shape per line: full frame, train, valid, test.
print(final.shape, train.shape, valid.shape, test.shape, sep='\n')
# A row-count check (final == train + valid + test) is deliberately left disabled:
# assert final.shape[0] == (train.shape[0] + valid.shape[0] + test.shape[0])

# The full frame is no longer needed; release it.
del final
gc.collect()

In [None]:
# Retain d+1 RTH day

# def remove_dplusone(df):
#     # drop all RTH days, so that we mark a day previous as RTH
#     index = df.query('~rth_dayplusone.isna()').index
#     df.drop(index, inplace=True)
#     df.drop(['rth_dayplusone'], axis = 1, inplace = True)
#     return df

# def retain_dplusone(df):
#     df.drop(['rth_dayplusone'], axis = 1, inplace = True)
#     return df


# train = retain_dplusone(train)
# valid = retain_dplusone(valid)
# test = remove_dplusone(test)

In [None]:
# Basic sanity checks: the splits must be disjoint and ordered in time ...
assert train.censusdate.max() < valid.censusdate.min()
assert valid.censusdate.max() < test.censusdate.min()
# ... and in every split the 3-day target rate must be below the 7-day rate.
for split in (train, valid, test):
    assert split.hosp_target_3_day_hosp.mean() < split.hosp_target_7_day_hosp.mean()
# Drop the loop variable so it does not keep a split alive after a later `del`.
del split
print('Success...')

In [None]:
# Date coverage and target rates of each split.
for split_name, split in (('Train', train), ('Valid', valid), ('Test', test)):
    print(
        f'{split_name} set covers {split.censusdate.min()} to {split.censusdate.max()} '
        f'with 3_day_hosp percentage {split.hosp_target_3_day_hosp.mean()} '
        f'and 7_day_hosp percentage {split.hosp_target_7_day_hosp.mean()}'
    )
# Drop the loop variable so it does not keep a split alive after a later `del`.
del split


In [None]:
def fill_na_train(df):
    """Impute NaNs in the training frame with per-column medians.

    A column is imputed when it is not listed in ``IDEN_COLS``, is not of
    ``category`` dtype, and either contains at least one NaN or has a name
    starting with ``"vtl"`` or ``"hosp"`` (those are always recorded, even
    when they currently hold no NaN).

    Args:
        df: Training DataFrame. It is modified in place (the imputed columns
            are reassigned) and also returned.

    Returns:
        tuple: ``(df, medians)`` where ``medians`` is a ``pd.Series`` indexed
        by the imputed column names and holding the median used for each. The
        caller applies it to the validation/test frames and persists it for
        use at prediction time.
    """
    nan_cols = []
    nan_medians = []
    for col in df.columns:
        if col in IDEN_COLS or df[col].dtype.name == 'category':
            continue
        if df[col].isnull().any() or col.startswith(("vtl", "hosp")):
            median = df[col].median()
            nan_cols.append(col)
            nan_medians.append(median)
            # Assign the filled column back instead of calling
            # `df[col].fillna(..., inplace=True)`: that chained in-place form
            # is deprecated and does not update `df` under Copy-on-Write.
            df[col] = df[col].fillna(median)
    medians = pd.Series(data=nan_medians, index=nan_cols)
    return df, medians

def fill_na_valid_or_test(df, na_filler):
    """Return a new frame in which NaNs are replaced by the values in ``na_filler``.

    Args:
        df: Validation or test DataFrame; it is not modified.
        na_filler: Fill values passed to ``DataFrame.fillna`` (the caller
            supplies the per-column median Series computed on the training set).

    Returns:
        The filled DataFrame.
    """
    filled = df.fillna(value=na_filler)
    return filled

In [None]:
%%time

# Fill the remaining NaNs. Past information is no longer forward-filled, so a
# global imputation would be incorrect: the medians are computed on the train
# split only and then applied unchanged to the valid and test splits.
# `na_filler` (the per-column medians) is pickled in a later cell so the same
# values can be used during predictions.

train, na_filler = fill_na_train(train)
valid = fill_na_valid_or_test(valid, na_filler)
test = fill_na_valid_or_test(test, na_filler)

# Reclaim memory from the frames that were just replaced.
gc.collect()

In [None]:
# Split a frame into features, targets and identification columns.
# facilityid stays in the idens; a categorical copy named 'facility' is added
# to the features for featurisation.
def prep(df):
    """Separate a split into feature matrix, targets and identifier columns.

    Args:
        df: DataFrame containing every column in ``IDEN_COLS``, the columns
            ``hosp_target_3_day_hosp`` / ``hosp_target_7_day_hosp`` and the
            feature columns. It is not modified (the original version added a
            'facility' column to it, which raised SettingWithCopyWarning when
            ``df`` was a slice of another frame).

    Returns:
        tuple: ``(x, target_3_day, target_7_day, idens)`` where

        * ``x`` holds all columns except ``IDEN_COLS`` and any column whose
          name contains 'target', cast to float32 and re-indexed from 0, plus
          a categorical 'facility' column copied from 'facilityid';
        * ``target_3_day`` / ``target_7_day`` are float32 numpy arrays;
        * ``idens`` is ``df`` restricted to ``IDEN_COLS`` and keeps ``df``'s
          original index.
    """
    target_cols = [col for col in df.columns if 'target' in col]
    drop_cols = IDEN_COLS + target_cols

    target_3_day = df.hosp_target_3_day_hosp.astype('float32').values
    target_7_day = df.hosp_target_7_day_hosp.astype('float32').values

    features = df.drop(columns=drop_cols).reset_index(drop=True)

    # Index.difference returns the remaining labels sorted (when sortable), so
    # the float32 feature columns end up in sorted order, as before.
    x = features[features.columns.difference(['facility'])].astype('float32')

    # Categorical facility feature, re-indexed from 0 so it aligns with `x`.
    x['facility'] = df['facilityid'].astype('category').reset_index(drop=True)

    idens = df.loc[:, IDEN_COLS]

    return x, target_3_day, target_7_day, idens

In [None]:
%%time

# Separate targets, feature frame (x) and identification columns for each split.
# Each source frame is deleted right after it has been processed so that only
# one un-split frame needs to stay in memory alongside the outputs.
train_x, train_target_3_day, train_target_7_day, train_idens = prep(train)
del train
valid_x, valid_target_3_day, valid_target_7_day, valid_idens = prep(valid)
del valid
test_x, test_target_3_day, test_target_7_day, test_idens = prep(test)
del test

# Reclaim the memory of the deleted frames.
gc.collect()

In [None]:
# Within each split, the x frame, both target arrays and the idens must have
# the same number of rows.
assert len({train_x.shape[0], train_target_3_day.shape[0], train_target_7_day.shape[0], train_idens.shape[0]}) == 1
assert len({valid_x.shape[0], valid_target_3_day.shape[0], valid_target_7_day.shape[0], valid_idens.shape[0]}) == 1
assert len({test_x.shape[0], test_target_3_day.shape[0], test_target_7_day.shape[0], test_idens.shape[0]}) == 1

# Train, valid and test must have the same number of feature columns.
assert len({train_x.shape[1], valid_x.shape[1], test_x.shape[1]}) == 1

# The idens of all splits must have the same number of columns.
assert len({train_idens.shape[1], valid_idens.shape[1], test_idens.shape[1]}) == 1

In [None]:
%%time

# Pickle the train, valid and test datasets plus the NaN filler into the
# processed folder, one 'final-<name>.pickle' file per object.

import pickle

artifacts = {
    'train_x': train_x,
    'train_target_3_day': train_target_3_day,
    'train_target_7_day': train_target_7_day,
    'train_idens': train_idens,
    'valid_x': valid_x,
    'valid_target_3_day': valid_target_3_day,
    'valid_target_7_day': valid_target_7_day,
    'valid_idens': valid_idens,
    'test_x': test_x,
    'test_target_3_day': test_target_3_day,
    'test_target_7_day': test_target_7_day,
    'test_idens': test_idens,
    'na_filler': na_filler,
}

# Files are written in the order listed above.
for artifact_name, artifact in artifacts.items():
    with open(processed_path / f'final-{artifact_name}.pickle', 'wb') as f:
        pickle.dump(artifact, f, protocol=4)

print("--------------Completed--------------")

In [None]:
# One shape per line: features and 3-day target of train, valid and test.
print(
    train_x.shape,
    train_target_3_day.shape,
    valid_x.shape,
    valid_target_3_day.shape,
    test_x.shape,
    test_target_3_day.shape,
    sep='\n',
)