# Add in demographics, other features to Infinity model
Use prior features for convenience. 

In [None]:
import sys
import datetime
import pandas as pd
import pickle as pkl
import numpy as np
import scipy

from pathlib import Path
import gc
import mlflow
from mlflow import log_metric, log_param, log_artifact, set_tag
import lightgbm as lgb 
from typing import Any
from dataclasses import dataclass
from sklearn.metrics import roc_auc_score, average_precision_score

%load_ext autoreload
%autoreload 2

In [None]:
sys.path.append('/src')
from shared import perf_utils
from shared.load_raw_data import load_raw_data_from_files

In [None]:
# Load the raw tables from '/data/raw'; later cells look tables up by name
# (e.g. data_dict['stays']).
data_dict = load_raw_data_from_files('/data/raw')

In [None]:
# Unpickle the prediction-times table and show its dimensions.
prediction_times_path = Path("/data/processed/prediction_times.pkl")
with prediction_times_path.open('rb') as f_in:
    prediction_times = pkl.load(f_in)
prediction_times.shape

In [None]:
# Feature names (presumably aligned with the matrix columns — confirm against
# the step that wrote these files).
with open('/data/processed/final_colnames.pkl', 'rb') as f_in:
    all_feature_names = pkl.load(f_in)

# Sparse feature matrix saved by an earlier processing step.
all_features = scipy.sparse.load_npz('/data/processed/final_csr.npz')


In [None]:
# Make sure the matrix is in CSR format, which supports the boolean row
# indexing used for the train/val/test splits below.
# NOTE(review): the earlier comment here said "hstack returns a coo matrix",
# but no hstack appears in the visible cells — presumably a leftover.
features = scipy.sparse.csr_matrix(all_features)
features

## Assign target values.
The target is rehospitalization (a transfer) within 4 days of the prediction time.

In [None]:
# Build the binary target: 1 when the transfer date is at most 4 days after
# the prediction date (no lower bound is applied), 0 otherwise.

stays = data_dict['stays']

# Row positions into `stays`, one per prediction time.
indices = prediction_times.stayrowindex.values
dates_of_transfer = stays.dateoftransfer.dt.date.values[indices]
prediction_dates = prediction_times.predictiontimestamp.dt.date.values

# Subtract the prediction date from the transfer date, pair by pair.
time_diffs = [
    transfer_date - predict_date
    for transfer_date, predict_date in zip(dates_of_transfer, prediction_dates)
]

# Anything that is not a plain timedelta (e.g. a missing transfer date) gets a
# huge day count so it can never fall inside the 4-day window.
time_diffs_in_days = np.array([
    diff.days if type(diff) is datetime.timedelta else 1e6
    for diff in time_diffs
])

mask = (time_diffs_in_days <= 4)
print(np.sum(mask))
print(np.mean(mask))

# Float 0/1 labels, one per prediction time.
target = mask.astype(np.float64)
print(np.mean(target))


## Do splits into train, val, test. 

In [None]:
# Date boundaries for the chronological train / validation / test split.
# The end dates are used as inclusive upper bounds by the split masks below.
train_start_date = '2017-01-01'
train_end_date = '2019-07-31'
val_end_date = '2019-11-30'
# NOTE(review): test_end_date is not referenced by the split masks in the
# visible cells — confirm whether the test split should be capped at this date.
test_end_date = '2020-02-28'

### Deal with lookback period for stays features... 
We use a 90-day lookback for the stays features, so prediction times with start dates before 2017-01-01 + 90 days should be excluded.

In [None]:
# Length of the lookback window used by the stays features.
td = pd.Timedelta(days=90)

# Start date of the stay behind each prediction time.
stay_indices = prediction_times.stayrowindex.values
start_dates = stays.startdate[stay_indices]

# Earliest date kept for training: training start plus the lookback window.
min_date = pd.Timestamp(train_start_date) + td
min_date

In [None]:
print('Getting masks')
# Compare at day granularity using datetime64 values instead of an object
# array of datetime.date: ordering comparisons between datetime.date and
# pd.Timestamp are deprecated in pandas 1.x and raise TypeError in pandas 2.x.
prediction_days = pd.to_datetime(prediction_times.predictiontimestamp.dt.date)
train_end = pd.Timestamp(train_end_date)
val_end = pd.Timestamp(val_end_date)

# Plain numpy boolean arrays, so they can index both the sparse feature matrix
# and the dense target vector.
train_mask = ((prediction_days >= min_date) & (prediction_days <= train_end)).values
val_mask = ((prediction_days > train_end) & (prediction_days <= val_end)).values
# NOTE(review): the test split has no upper bound (test_end_date is unused) —
# confirm this is intended.
test_mask = (prediction_days > val_end).values
np.sum(train_mask), np.sum(val_mask), np.sum(test_mask)

In [None]:
def _take_split(row_mask):
    """Slice features, targets and prediction times down to the rows selected by row_mask."""
    return {
        'X': features[row_mask, :],
        'Y': target[row_mask],
        'PredictionTimes': prediction_times[row_mask].reset_index(drop=True),
    }


# Materialise each split and keep the individual pieces as notebook-level names.
train_dict = _take_split(train_mask)
train_x, train_y, train_ptimes = (train_dict[key] for key in ('X', 'Y', 'PredictionTimes'))
print(train_x.shape)

val_dict = _take_split(val_mask)
val_x, val_y, val_ptimes = (val_dict[key] for key in ('X', 'Y', 'PredictionTimes'))
print(val_x.shape)

test_dict = _take_split(test_mask)
test_x, test_y, test_ptimes = (test_dict[key] for key in ('X', 'Y', 'PredictionTimes'))
print(test_x.shape)

# Persist the three splits as pickles.
for out_path, split in (
    ('/data/processed/more_features_train_dataset.pkl', train_dict),
    ('/data/processed/more_features_val_dataset.pkl', val_dict),
    ('/data/processed/more_features_test_dataset.pkl', test_dict),
):
    with open(out_path, 'wb') as f_out:
        pkl.dump(split, file=f_out)

In [None]:
# Mean of the 0/1 labels in each split, i.e. the fraction of positive targets.
np.mean(train_y), np.mean(val_y), np.mean(test_y)

In [None]:
# Wrap the training and validation matrices with their labels as LightGBM datasets.
train_data = lgb.Dataset(data=train_x, label=train_y)
valid_data = lgb.Dataset(data=val_x, label=val_y)

In [None]:
# LightGBM settings passed to objective(); every entry is also logged to MLflow there.
lgb_params = dict(
    boosting='gbdt',
    objective='binary',
    metric=['auc', 'binary'],
    learning_rate=0.02,
    n_estimators=1000,
    max_depth=-1,
    num_threads=32,
    num_leaves=63,
    verbose=3,
    two_round=True,
    early_stopping_rounds=10,
)

In [None]:
# Point MLflow at the tracking server and select the experiment runs are logged under.

mlflow.set_tracking_uri('http://mlflow.saiva-dev')
# Experiment name as it appears in the MLflow UI.
# NOTE(review): the name says "3-day" while the target built in this notebook
# uses a 4-day window — confirm the experiment name is intended.
mlflow.set_experiment('edge-3-day-hosp')

In [None]:
@dataclass
class BaseModel:
    """Record pairing a trained model with the name and type it is tracked under."""

    # Identifier of the model; objective() passes the MLflow run UUID.
    model_name: str
    # Short tag for the model family; objective() passes 'lgb'.
    model_type: str
    # The trained model object itself.
    model: Any


# Collects one BaseModel per training run in this notebook session.
base_models = list()

In [None]:
def objective(params):
    """Train a LightGBM model inside an MLflow run, score it on the test split and log the results.

    Reads the notebook-level ``train_data``, ``valid_data``, ``test_x``,
    ``test_y``, ``test_ptimes`` and ``all_feature_names``, and appends the
    trained model to ``base_models``.

    Args:
        params: LightGBM parameter dict; every entry is also logged as an
            MLflow param.

    Returns:
        Tuple ``(model, test_yhat)``: the trained model and its predictions on
        ``test_x``. Returned so later cells can use them at notebook scope.
    """
    print(f'Training LGB models with parameter: {params}')
    with mlflow.start_run():
        run_uuid = mlflow.active_run().info.run_uuid
        for name, value in params.items():
            log_param(name, value)
        set_tag('model', 'lgb')
        print("=============================Training started...=============================")
        model = lgb.train(params,
                          train_set=train_data,
                          valid_sets=[valid_data])

        print("=============================Training completed...=============================")
        gc.collect()

        # =========================== Predict on test dataset =======================
        test_yhat = model.predict(test_x)
        print(test_yhat.shape)
        print("=============================Prediction completed...=============================")

        test_auroc = roc_auc_score(test_y, test_yhat)
        test_ap = average_precision_score(test_y, test_yhat)
        saiva_recall = perf_utils.saiva_recall_at_top_K(test_y, test_yhat, test_ptimes)
        recall = perf_utils.recall_at_top_K(test_y, test_yhat, test_ptimes)

        log_metric('Saiva_recall_at_top_15', saiva_recall)
        log_metric('Recall_at_top_15', recall)
        log_metric('Test_set_AUROC', test_auroc)
        log_metric('Test_set_AUPRC', test_ap)

        base_model = BaseModel(model_name=run_uuid, model_type='lgb', model=model)
        base_models.append(base_model)
        # ================= Save model related artifacts =========================
        with open(f'./{run_uuid}.pickle', 'wb') as f:
            pkl.dump(base_model, f)
        log_artifact(f'./{run_uuid}.pickle')

        input_features = pd.DataFrame(all_feature_names, columns=['feature'])
        input_features.to_csv('./input_features.csv', index=False)
        log_artifact('./input_features.csv')

        # =============== Save the code used to training in S3 ======================
        for notebook in Path('/src/notebooks').glob('0*.ipynb'):
            log_artifact(str(notebook))

        for shared_code in Path('/src/shared').glob('*.py'):
            log_artifact(str(shared_code))

        for client_code in Path('/src/clients').glob('*.py'):
            log_artifact(str(client_code))

        # =============== Save the diagnoses & meds used during training ======================
        log_artifact('/data/processed/diagnoses_codes.pkl')
        log_artifact('/data/processed/med_codes.pkl')
        log_artifact('/data/processed/vital_bins.pkl')

    return model, test_yhat


# Bind the trained model and its test predictions at notebook scope: the cells
# below read `model` and `test_yhat`, which would otherwise exist only inside
# objective().
model, test_yhat = objective(lgb_params)

In [None]:
# Bootstrap confidence intervals on the test split via the shared perf_utils helper.
# NOTE(review): presumably B is the number of bootstrap resamples and
# sample_by='patient' resamples whole patients — confirm in perf_utils.
# NOTE(review): this reads `test_yhat` from the notebook's global scope, so it
# must have been assigned there before this cell runs.
stats_by_patient, stats_patient = perf_utils.bootstrap_confidence_intervals(test_y, test_yhat, test_ptimes, B=500, sample_by='patient')
stats_by_patient

### =======================================END==========================================

# Results - ablations
After getting baseline with all new features, take out stays, then notes. These cells are the results only. 

## Results for all features except stay features. 

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score

# Score the test split with the notebook-level `model`.
test_yhat = model.predict(test_x)
print(test_yhat.shape)

# Threshold-free metrics, followed by the top-K recall helpers from perf_utils.
test_auroc = roc_auc_score(test_y, test_yhat)
test_ap = average_precision_score(test_y, test_yhat)
saiva_recall = perf_utils.saiva_recall_at_top_K(test_y, test_yhat, test_ptimes)
recall = perf_utils.recall_at_top_K(test_y, test_yhat, test_ptimes)

# One line per metric.
print(
    f"Saiva recall at top 15: {saiva_recall}",
    f"Recall at top 15: {recall}",
    f'Test set AUROC: {test_auroc}',
    f'Test set AUPRC: {test_ap}',
    sep="\n",
)

In [None]:
# Bootstrap confidence intervals on the test split via the shared perf_utils helper.
# NOTE(review): this call uses B=200 while the other bootstrap cells in this
# notebook use B=500 — confirm the difference is deliberate.
stats_by_patient, stats_patient = perf_utils.bootstrap_confidence_intervals(test_y, test_yhat, test_ptimes, B=200, sample_by='patient')
stats_by_patient

## Results for all features minus notes. 

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score

# Score the test split with the notebook-level `model`.
test_yhat = model.predict(test_x)
print(test_yhat.shape)

# Threshold-free metrics, followed by the top-K recall helpers from perf_utils.
test_auroc = roc_auc_score(test_y, test_yhat)
test_ap = average_precision_score(test_y, test_yhat)
saiva_recall = perf_utils.saiva_recall_at_top_K(test_y, test_yhat, test_ptimes)
recall = perf_utils.recall_at_top_K(test_y, test_yhat, test_ptimes)

# One line per metric.
print(
    f"Saiva recall at top 15: {saiva_recall}",
    f"Recall at top 15: {recall}",
    f'Test set AUROC: {test_auroc}',
    f'Test set AUPRC: {test_ap}',
    sep="\n",
)

In [None]:
# Bootstrap confidence intervals on the test split via the shared perf_utils helper.
# NOTE(review): presumably B is the number of bootstrap resamples and
# sample_by='patient' resamples whole patients — confirm in perf_utils.
stats_by_patient, stats_patient = perf_utils.bootstrap_confidence_intervals(test_y, test_yhat, test_ptimes, B=500, sample_by='patient')
stats_by_patient

## Results for just stays
Because why not see...

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score

# Score the test split with the notebook-level `model`.
test_yhat = model.predict(test_x)
print(test_yhat.shape)

# Threshold-free metrics, followed by the top-K recall helpers from perf_utils.
test_auroc = roc_auc_score(test_y, test_yhat)
test_ap = average_precision_score(test_y, test_yhat)
saiva_recall = perf_utils.saiva_recall_at_top_K(test_y, test_yhat, test_ptimes)
recall = perf_utils.recall_at_top_K(test_y, test_yhat, test_ptimes)

# One line per metric.
print(
    f"Saiva recall at top 15: {saiva_recall}",
    f"Recall at top 15: {recall}",
    f'Test set AUROC: {test_auroc}',
    f'Test set AUPRC: {test_ap}',
    sep="\n",
)

In [None]:
# Bootstrap confidence intervals on the test split via the shared perf_utils helper.
# NOTE(review): presumably B is the number of bootstrap resamples and
# sample_by='patient' resamples whole patients — confirm in perf_utils.
stats_by_patient, stats_patient = perf_utils.bootstrap_confidence_intervals(test_y, test_yhat, test_ptimes, B=500, sample_by='patient')
stats_by_patient