In [None]:
import gc
import sys
import numpy as np
from pathlib import Path
from datetime import timedelta, datetime
import re

import pandas as pd

from saiva.model.shared.constants import MODEL_TYPE
from saiva.model.shared.utils import get_client_class, url_encode_cols


## Load config

In [None]:
from saiva.model.shared.constants import LOCAL_TRAINING_CONFIG_PATH
from saiva.training.utils import load_config

# Load the training configuration from the local config path.
config = load_config(LOCAL_TRAINING_CONFIG_PATH)
# Training-specific section; the cells below read organization configs and
# training metadata (experiment dates, hyper-parameter-tuning flag) from it.
training_config = config.training_config

In [None]:
# Normalise the model type to lower case: it is interpolated into column names
# (`target_3_day_<model>`, `positive_date_<model>`) and output file names below.
MODEL_TYPE = MODEL_TYPE.lower()
print('MODEL:', MODEL_TYPE)

 ## =========== HYPER_PARAMETER_TUNING is read from the training config (training_metadata.hyper_parameter_tuning) ===========

In [None]:
# Combined client label: every organization id in the training config, joined with '+'.
CLIENT = "+".join(org_config.organization_id for org_config in training_config.organization_configs)

# Work on a copy of the configured dates. The adjustments below assign into the
# mapping; doing that on the config's own object would shift `train_start_date`
# by a further 31 days every time this cell is re-run without reloading the config.
# NOTE(review): assumes `experiment_dates` is a mapping of date strings (it is
# indexed and assigned by key below) - confirm against the config schema.
EXPERIMENT_DATES = dict(training_config.training_metadata.experiment_dates)
HYPER_PARAMETER_TUNING = training_config.training_metadata.hyper_parameter_tuning

# Start training on day 31 so that the 2/7/14/30-day cumulative-sum windows are
# all fully populated from the first training row.
# One day is added to `censusdate` later in the notebook, so the first date in
# `train` will be `EXPERIMENT_DATES['train_start_date'] + 1 day`; that is why
# 31 days are added here rather than 30.
EXPERIMENT_DATES['train_start_date'] = str(
    (pd.to_datetime(EXPERIMENT_DATES['train_start_date']) + pd.DateOffset(days=31)).date()
)

if not HYPER_PARAMETER_TUNING:
    # Without hyper-parameter tuning the validation window shrinks to the last
    # two days (validation_end - 1 day .. validation_end) and training runs up
    # to the day before it.
    validation_end = datetime.strptime(EXPERIMENT_DATES['validation_end_date'], '%Y-%m-%d')
    EXPERIMENT_DATES['train_end_date'] = (validation_end - timedelta(days=2)).strftime('%Y-%m-%d')
    EXPERIMENT_DATES['validation_start_date'] = (validation_end - timedelta(days=1)).strftime('%Y-%m-%d')

print(CLIENT)
print(HYPER_PARAMETER_TUNING)
print(EXPERIMENT_DATES)

In [None]:
# Directory holding the cleaned input parquet and the pickled outputs written
# at the end of this notebook; created if it does not exist yet.
processed_path = Path('/data/processed')
processed_path.mkdir(parents=True, exist_ok=True)
# Name of the cleaned feature table that is read in the next cell.
filename = 'final_cleaned_df.parquet'

In [None]:
# Load the cleaned patient-day feature table.
final = pd.read_parquet(processed_path / filename)

In [None]:
# NOTE(review): presumably URL-encodes column names so they are safe to use as
# feature names - confirm against `saiva.model.shared.utils.url_encode_cols`.
final = url_encode_cols(final)

In [None]:
# Fail fast if the input table has no target column for the configured model type.
assert f'target_3_day_{MODEL_TYPE}' in final.columns, f"There is no target for training `{MODEL_TYPE}` model"

In [None]:
# Identification columns: split off from the feature matrix in `prep` and saved
# separately as the `idens` frames.
IDEN_COLS = [
    'censusdate',
    'facilityid',
    'masterpatientid',
    'LFS',
    'primaryphysicianid',
    'payername',
    'to_from_type',
    'client',
    'admissionstatus',
    f'positive_date_{MODEL_TYPE}',
]

# The UPT model does not use rows whose payer name contains 'hospice'
# (case-insensitive; missing payer names are kept).
if MODEL_TYPE == 'model_upt':
    is_hospice = final['payername'].str.contains('hospice', case=False, regex=True, na=False)
    final = final[~is_hospice]

In [None]:
# Column processing: the client is the text before the first underscore of
# `masterpatientid`; facility ids are then namespaced as `<client>_<facilityid>`.
client_prefix = final['masterpatientid'].map(lambda patient_id: patient_id.split('_')[0])
final['client'] = client_prefix
final['facilityid'] = final['client'] + '_' + final['facilityid'].astype(str)

In [None]:
# Expose `days_since_last_admission` under the name `LFS`, which is one of the
# identification columns listed in IDEN_COLS.
final['LFS'] = final['days_since_last_admission']

In [None]:
# The census date is incremented by one day because a prediction made on a given
# day only includes data up to the previous night. After the shift, the data on
# every census date is the data up to the night before it.
# NOTE: this cell is not idempotent - re-running it shifts the dates again.
print(final.shape)

# Shift censusdate forward by one day
final['censusdate'] = pd.to_datetime(final['censusdate']) + pd.Timedelta(days=1)

print(final.shape)

In [None]:
def drop_unwanted_columns(df):
    """Return `df` without `dateofadmission` and without other models' label columns.

    Every column whose name contains `positive_date_` or `target_3_day_` is
    dropped unless it is the one belonging to the current `MODEL_TYPE`.
    Columns that are absent are ignored. The input frame is not modified.
    """
    own_date = f'positive_date_{MODEL_TYPE}'
    own_target = f'target_3_day_{MODEL_TYPE}'

    names = df.columns
    other_dates = [name for name in names[names.str.contains('positive_date_')] if name != own_date]
    other_targets = [name for name in names[names.str.contains('target_3_day_')] if name != own_target]

    unwanted = ['dateofadmission', *other_dates, *other_targets]
    return df.drop(columns=unwanted, errors='ignore')

# drop unwanted columns for this model
final = drop_unwanted_columns(final)

# Rows without a target value are treated as negatives (False).
final[f'target_3_day_{MODEL_TYPE}'] = final[f'target_3_day_{MODEL_TYPE}'].fillna(False)

In [None]:
%%time
# Split by census date using the (string) bounds in EXPERIMENT_DATES.
# Train and validation windows are inclusive on both ends; the test window has
# no upper bound.
train = final.loc[(final.censusdate >= EXPERIMENT_DATES['train_start_date']) & (final.censusdate <= EXPERIMENT_DATES['train_end_date'])]
valid = final.loc[(final.censusdate >= EXPERIMENT_DATES['validation_start_date']) & (final.censusdate <= EXPERIMENT_DATES['validation_end_date'])]
test = final.loc[final.censusdate >= EXPERIMENT_DATES['test_start_date']]

def sort_group(group):
    """Return one (facilityid, censusdate) group ordered by `masterpatientid`."""
    return group.sort_values('masterpatientid')

# Order validation and test rows by facility, census date and patient.
# `reset_index` returns a new frame, so its result must be assigned; the bare
# call previously used for `valid` left the groupby MultiIndex in place.
# NOTE(review): pandas >= 2.2 deprecates passing the grouping columns to the
# function given to `groupby(...).apply`; this code relies on `facilityid` and
# `censusdate` remaining as columns - verify when upgrading pandas.
valid = valid.groupby(['facilityid', 'censusdate']).apply(sort_group)
valid = valid.reset_index(drop=True)

test = test.groupby(['facilityid', 'censusdate']).apply(sort_group)
test = test.reset_index(drop=True)

print(final.shape)
print(train.shape)
print(valid.shape)
print(test.shape)

# The full table is no longer needed once the three splits exist.
del final
gc.collect()

In [None]:
# List every remaining target column (only the current model's is expected).
target_columns = [name for name in train.columns if 'target_3_day' in name]
for name in target_columns:
    print(name)

In [None]:
# Class balance of the target in each split.
target_column = f'target_3_day_{MODEL_TYPE}'
train_counts = train[target_column].value_counts()
valid_counts = valid[target_column].value_counts()
test_counts = test[target_column].value_counts()
print(f'train - target_3_day_{MODEL_TYPE}', train_counts)
print(f'valid - target_3_day_{MODEL_TYPE}', valid_counts)
print(f'test - target_3_day_{MODEL_TYPE}', test_counts)

In [None]:
# Basic sanity checks: the splits must be strictly ordered in time, with no
# census date shared between train and valid or between valid and test.
assert train.censusdate.max() < valid.censusdate.min()
assert valid.censusdate.max() < test.censusdate.min()
print('Success...')

In [None]:
# Date range and mean of the target (share of positive rows) for each split.
for split_label, split_frame in (('Train', train), ('Valid', valid), ('Test', test)):
    first_day = split_frame.censusdate.min()
    last_day = split_frame.censusdate.max()
    positive_share = split_frame[f"target_3_day_{MODEL_TYPE}"].mean()
    print(f'{split_label} set covers {first_day} to {last_day} with 3_day_{MODEL_TYPE} percentage {positive_share}')


In [None]:
# Report datetime columns that are not identification columns, i.e. datetime
# columns that would end up in the feature matrix.
for col in train.columns:
    is_datetime = train[col].dtypes == 'datetime64[ns]'
    if is_datetime and col not in IDEN_COLS:
        print(col, train[col].dtypes)

In [None]:
# Remove the target values & identification columns.
# Keep facilityid in idens and add a duplicate field as facility for featurisation.
def prep(df, feature_names=None, category_columns=None, pandas_categorical=None):
    """Split a patient-day frame into a feature matrix, target vector and identifier frame.

    Called without the optional arguments (training split) the feature order,
    categorical columns and category levels are learned from `df`. Called with
    them (validation/test splits) the features are reordered and the categorical
    columns re-coded to match the training layout.

    Relies on the notebook-level globals `MODEL_TYPE` and `IDEN_COLS`.
    `df` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column in `IDEN_COLS`, `target_3_day_<MODEL_TYPE>`
        and `payertype`.
    feature_names : list, optional
        Feature column order learned from the training split.
    category_columns : list, optional
        Names of the categorical feature columns learned from the training split.
    pandas_categorical : list of list, optional
        Category levels for each entry of `category_columns`, in the same order.

    Returns
    -------
    tuple
        `(x, target_3_day, idens, feature_names, category_columns, pandas_categorical)`
        where `x` is a float32 array (categoricals as codes, unknown/missing
        levels as NaN), `target_3_day` a float32 array, and `idens` the
        identifier columns plus a `long_short_term` label.

    Raises
    ------
    ValueError
        If the feature columns do not match `feature_names`, or if
        `category_columns` and `pandas_categorical` differ in length.
    """
    target_col = f'target_3_day_{MODEL_TYPE}'
    drop_cols = IDEN_COLS + [col for col in df.columns if 'target' in col]
    drop_cols += [col for col in df.columns if 'positive_date_' in col]

    target_3_day = df[target_col].astype('float32').values

    # Build the features on a fresh positional index instead of resetting the
    # caller's frame in place, and add the `facility` feature to the copy only.
    x = df.drop(columns=drop_cols).reset_index(drop=True)
    # `.values` hands over the Categorical positionally (x has a new index).
    x['facility'] = df['facilityid'].astype('category').values

    if feature_names is None:
        feature_names = x.columns.tolist()
    elif len(x.columns) != len(feature_names):
        raise ValueError("train and valid dataset feature names do not match")
    elif (x.columns != feature_names).any():
        # Same number of columns but not the same order: only reorder when the
        # names really are the same set, otherwise `reindex` would silently
        # create all-NaN features.
        present = set(x.columns)
        missing = [name for name in feature_names if name not in present]
        if missing:
            raise ValueError(
                f"train and valid dataset feature names do not match; missing features: {missing}"
            )
        x = x.reindex(columns=feature_names)

    if category_columns is None:
        category_columns = x.select_dtypes(include='category').columns.tolist()

    if pandas_categorical is None:
        pandas_categorical = [list(x[col].cat.categories) for col in category_columns]
    else:
        if len(category_columns) != len(pandas_categorical):
            raise ValueError("train and valid dataset categorical_feature do not match")
        # Re-code each categorical with the training levels; unseen levels become NaN.
        for col, category in zip(category_columns, pandas_categorical):
            if list(x[col].cat.categories) != list(category):
                x[col] = x[col].cat.set_categories(category)

    idens = df.loc[:, IDEN_COLS].reset_index(drop=True)
    # Add the 'long_short_term' column to idens. The label is derived from the
    # row's original payer type, so payer types unseen in training (turned into
    # NaN by the re-coding above) are still labelled correctly.
    payer_type = df['payertype']
    short_term_cond = ((payer_type == 'Managed Care') | (payer_type == 'Medicare A')).to_numpy()
    nonType_cond = (payer_type == 'no payer info').to_numpy()
    if nonType_cond.sum() > 0:
        print(f'{nonType_cond.sum()} patient days have no payer info')
    idens['long_short_term'] = np.select(
        [short_term_cond, nonType_cond],
        ['short', 'no payer info'],
        default='long',
    )

    # Convert to a numpy array: categoricals become their integer codes, with
    # the "missing" code -1 mapped to NaN.
    x[category_columns] = x[category_columns].apply(lambda col: col.cat.codes).replace({-1: np.nan})
    x = x.to_numpy(dtype=np.float32, na_value=np.nan)

    return x, target_3_day, idens, feature_names, category_columns, pandas_categorical

In [None]:
%%time

# Separate target, x-frame and identification columns.
# The feature names, categorical columns and category levels learned from the
# training split are passed on so that valid/test use the same feature layout.
# Each source frame is deleted as soon as it has been converted.
train_x, train_target_3_day, train_idens, feature_names, cate_columns, pandas_categorical = prep(train)
del train
valid_x, valid_target_3_day, valid_idens, _, _, _ = prep(valid, feature_names, cate_columns, pandas_categorical)
del valid
test_x, test_target_3_day, test_idens, _, _, _ = prep(test, feature_names, cate_columns, pandas_categorical)
del test

gc.collect()

In [None]:
# make sure that the x's, targets and idens all have the same number of rows
assert train_x.shape[0] == train_target_3_day.shape[0] == train_idens.shape[0]
assert valid_x.shape[0] == valid_target_3_day.shape[0] == valid_idens.shape[0]
assert test_x.shape[0] == test_target_3_day.shape[0] == test_idens.shape[0]

# make sure that train, valid, and test have the same number of feature columns
assert train_x.shape[1] == valid_x.shape[1] == test_x.shape[1]

# make sure that the idens all have the same number of columns
assert train_idens.shape[1] == valid_idens.shape[1] == test_idens.shape[1]

In [None]:
%%time

# Save train, test and validation datasets in local folder

import pickle


def _dump_pickle(obj, path):
    """Serialise `obj` to `path` using pickle protocol 4."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=4)


# Feature matrix, target vector and identifier frame of every split, written as
# `final-<split>_<part>_<MODEL_TYPE>.pickle` under `processed_path`.
split_outputs = (
    ('train', train_x, train_target_3_day, train_idens),
    ('valid', valid_x, valid_target_3_day, valid_idens),
    ('test', test_x, test_target_3_day, test_idens),
)
for split_name, split_x, split_target, split_idens in split_outputs:
    _dump_pickle(split_x, processed_path/f'final-{split_name}_x_{MODEL_TYPE}.pickle')
    _dump_pickle(split_target, processed_path/f'final-{split_name}_target_3_day_{MODEL_TYPE}.pickle')
    _dump_pickle(split_idens, processed_path/f'final-{split_name}_idens_{MODEL_TYPE}.pickle')

# Feature layout learned from the training split.
_dump_pickle(cate_columns, processed_path/'cate_columns.pickle')
_dump_pickle(feature_names, processed_path/'feature_names.pickle')
_dump_pickle(pandas_categorical, processed_path/'pandas_categorical.pickle')

# The categorical column list is also written to the current working directory.
_dump_pickle(cate_columns, './cate_columns.pickle')

print("--------------Completed--------------")

In [None]:
# Final shapes of the feature matrix and target vector for each split.
for array in (
    train_x, train_target_3_day,
    valid_x, valid_target_3_day,
    test_x, test_target_3_day,
):
    print(array.shape)