In [None]:
! pip install matplotlib

In [None]:
import gc
import json
import pickle
import sys
import timeit
from datetime import timedelta, datetime
from pathlib import Path

import hyperopt
import lightgbm as lgb
import pandas as pd
import numpy as np
from hyperopt import tpe, fmin
from sklearn import metrics
from sklearn.metrics import roc_auc_score, average_precision_score

import mlflow
from mlflow import log_metric, log_param, log_artifact, set_tag

sys.path.insert(0, '/src')
from shared.utils import get_client_class
from dataclasses import dataclass
from typing import Any
import matplotlib.pyplot as plt 


## Load config

In [None]:
from shared.constants import LOCAL_TRAINING_CONFIG_PATH
from shared.utils import load_config

# Load the local training configuration. `training_config` is the sub-section
# that the rest of the notebook reads (org configs and training metadata).
config = load_config(LOCAL_TRAINING_CONFIG_PATH)
training_config = config.training_config

In [None]:
# JUPYTER NOTEBOOK Magic function
# Write the cell content to a file & execute the cell

from IPython.core.magic import register_cell_magic


@register_cell_magic
def write_and_run(line, cell):
    """Cell magic: save the cell source to a file, then execute the cell.

    Usage: ``%%write_and_run [-a] <file>``

    The last whitespace-separated token of ``line`` is the target file.
    The file is overwritten unless the line consists of exactly two tokens
    and the first one is ``-a``, in which case the cell is appended.
    """
    tokens = line.split()
    target = tokens[-1]
    append_requested = len(tokens) == 2 and tokens[0] == '-a'
    with open(target, 'a' if append_requested else 'w') as handle:
        handle.write(cell)
    # Run the very same source in the current IPython session.
    get_ipython().run_cell(cell)

In [None]:
# ---- Run settings read from the loaded training config ----
EXPERIMENT_DATES = training_config.training_metadata.experiment_dates
HYPER_PARAMETER_TUNING = training_config.training_metadata.hyper_parameter_tuning
vector_model = training_config.training_metadata.vector_model

# Organization ids of all configured orgs, joined with "+".
CLIENT = "+".join(
    org_config.organization_id
    for org_config in training_config.ml_model_org_configs
)

TRAINING_DATA = CLIENT  # trained on which data? e.g. avante + champion
SELECTED_MODEL_VERSION = 'saiva-3-day-hosp-v5'  # e.g. v3, v4 or v6 model

# Name used to filter models in AWS quicksight & also used as ML Flow experiment name
MODEL_DESCRIPTION = f'{CLIENT}-3-day-hosp-v5'

# Filled in by the training objective: one config dict per trained model.
base_models = []
model_config = {}

if HYPER_PARAMETER_TUNING:
    DESCRIPTION_STRING = 'TUNING RUN'
else:
    # Train+valid run (NOT a tuning run): move train_end_date to two days before
    # validation_end_date and validation_start_date to one day before it.
    DESCRIPTION_STRING = 'TRAIN+VALID RUN'
    EXPERIMENT_DATES['train_end_date'] = (
        datetime.strptime(EXPERIMENT_DATES['validation_end_date'], '%Y-%m-%d')
        - timedelta(days=2)
    ).strftime('%Y-%m-%d')
    EXPERIMENT_DATES['validation_start_date'] = (
        datetime.strptime(EXPERIMENT_DATES['validation_end_date'], '%Y-%m-%d')
        - timedelta(days=1)
    ).strftime('%Y-%m-%d')

    
def get_training_runs():
    """Return how many objective evaluations to run: 9 when tuning, otherwise 1."""
    return 9 if HYPER_PARAMETER_TUNING else 1

def get_valid_data():
    """Return ``[valid_data]`` when tuning hyper-parameters, otherwise ``None``.

    ``valid_data`` is looked up at call time, so it only has to exist in the
    notebook namespace by the time training starts.
    """
    return [valid_data] if HYPER_PARAMETER_TUNING else None
    
# Visual check: print the run mode and display the (possibly adjusted) date ranges.
print(HYPER_PARAMETER_TUNING)    
EXPERIMENT_DATES

In [None]:
%%write_and_run parameterTunningConfig.py
# This cell is executed and its source is also written to parameterTunningConfig.py
# (by the %%write_and_run magic on the first line of the cell).

from hyperopt import hp
# NOTE(review): `global` at module level has no effect. HYPER_PARAMETER_TUNING must
# already exist in the namespace where this code runs (it is set in an earlier cell),
# so the written file is not importable on its own.
global HYPER_PARAMETER_TUNING

# Base LightGBM parameters shared by tuning and train+valid runs.
# NOTE(review): LightGBM documents `application` as an alias of `objective` —
# confirm whether both keys are needed.
lgb_param_space = {
     'application': 'binary',
     'objective': 'binary',
     'metric': 'auc',
     #'boosting_type': hp.choice('boosting_type', ['gbdt']),
     #'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
     #'max_depth': hp.quniform('max_depth', -1, 10, 1),
     #'min_child_samples': 20,
     #'min_child_weight': 0.001,
     #'min_split_gain': 0.0,
     'n_jobs': -1,
    #  'num_leaves': hp.quniform('num_leaves', 30, 300, 1),
     #'subsample': hp.uniform('subsample', 0, 1),
     #'subsample_for_bin': hp.quniform('subsample_for_bin', 200000, 500000, 1000),
     'verbose': 1,
     'is_unbalance': True,
     #'max_bin': hp.quniform('max_bin', 100,1000, 100),
}

# Parameter tuning: search learning_rate / n_estimators with hyperopt and stop early
# on the validation set; otherwise use fixed values with early stopping disabled
# (no validation set is passed to training in that mode, see get_valid_data).
if HYPER_PARAMETER_TUNING:
    lgb_param_space['learning_rate'] = hp.uniform('learning_rate', 0.01, 0.05)
    lgb_param_space['n_estimators'] = hp.quniform('n_estimators',108,405,10)
    lgb_param_space['early_stopping_round'] = 10
else:
    # NOTE(review): presumably the best values from an earlier tuning run — confirm.
    lgb_param_space['learning_rate'] = 0.017817765000204273
    lgb_param_space['n_estimators'] = 151
    lgb_param_space['early_stopping_round'] = None
    

## ======================= CONFIG ENDS ==============================

In [None]:
# Point MLflow at the tracking server and select the experiment for this notebook.
# NOTE(review): the tracking URI is hard-coded; consider reading it from config.
mlflow.set_tracking_uri('http://mlflow.saiva-dev')

# Experiment name which appears in MLflow (same string as MODEL_DESCRIPTION)
mlflow.set_experiment(MODEL_DESCRIPTION)

# Look the experiment up again to obtain its id; the id is stored in each
# model_config as 'model_s3_folder' by the training objective.
EXPERIMENT = mlflow.get_experiment_by_name(MODEL_DESCRIPTION)
MLFLOW_EXPERIMENT_ID = EXPERIMENT.experiment_id

print(f'Experiment ID: {MLFLOW_EXPERIMENT_ID}')

In [None]:
# Directory holding the pickled train/valid/test splits; created if missing.
processed_path = Path('/data/processed')
processed_path.mkdir(parents=True, exist_ok=True)

In [None]:
def _load_processed_pickle(filename):
    """Unpickle ``filename`` from ``processed_path`` and return the stored object."""
    # NOTE: unpickling can execute arbitrary code; only load files produced by this pipeline.
    with open(processed_path / filename, 'rb') as handle:
        return pickle.load(handle)


# Training split: features, 3-/7-day targets and identifier columns.
train_x = _load_processed_pickle('final-train_x.pickle')
train_target_3_day = _load_processed_pickle('final-train_target_3_day.pickle')
train_target_7_day = _load_processed_pickle('final-train_target_7_day.pickle')
train_idens = _load_processed_pickle('final-train_idens.pickle')

# Validation split.
valid_x = _load_processed_pickle('final-valid_x.pickle')
valid_target_3_day = _load_processed_pickle('final-valid_target_3_day.pickle')
valid_target_7_day = _load_processed_pickle('final-valid_target_7_day.pickle')
valid_idens = _load_processed_pickle('final-valid_idens.pickle')

# Test split.
test_x = _load_processed_pickle('final-test_x.pickle')
test_target_3_day = _load_processed_pickle('final-test_target_3_day.pickle')
test_target_7_day = _load_processed_pickle('final-test_target_7_day.pickle')
test_idens = _load_processed_pickle('final-test_idens.pickle')

# NA filler object; it is re-saved as a run artifact during training.
na_filler = _load_processed_pickle('final-na_filler.pickle')

In [None]:
# Sanity check: feature and 3-day target shapes for each split
# (row counts of x and target should match within a split).
print(train_x.shape)
print(train_target_3_day.shape)
print(valid_x.shape)
print(valid_target_3_day.shape)
print(test_x.shape)
print(test_target_3_day.shape)

In [None]:
# Compare the number of NA-filler entries with the number of feature columns.
print('na_fillers column count: ', len(na_filler.keys()))
print('feature column count: ', len(train_x.columns))

In [None]:
# Peek at the first training rows.
train_x.head()

In [None]:
# Wrap features and 3-day targets in LightGBM datasets. `valid_data` is only
# handed to training in tuning mode (see get_valid_data).
train_data = lgb.Dataset(train_x, label=train_target_3_day)
valid_data = lgb.Dataset(valid_x, label=valid_target_3_day)


In [None]:
@dataclass
class BaseModel:
    """Container pairing a trained model with its name and type.

    Instances are pickled and logged as run artifacts by the training
    objective, and unpickled again when a model is loaded from disk.
    """
    model_name: str  # the training objective passes the MLflow run id here
    model_type: str  # model family tag; the training objective passes 'lgb'
    model: Any  # the trained model object (whatever lgb.train returned)


def get_facilities_from_train_data(df):
    """Return the distinct ``facilityid`` values of ``df`` as a list, in order of first appearance."""
    distinct_facilities = df['facilityid'].unique()
    return list(distinct_facilities)

def get_date_diff(start_date, end_date):
    """Return the whole-day difference ``end_date - start_date`` formatted as ``'<n> days'``.

    Both arguments are parsed with ``pd.to_datetime``; the result is negative
    when ``end_date`` precedes ``start_date``.
    """
    elapsed = pd.to_datetime(end_date) - pd.to_datetime(start_date)
    return f'{elapsed.days} days'

In [None]:
def precision_recall_at_k(group):
    """Calculate how many transfers were caught up to each rank.

    ``group`` holds the rows of one groupby group; the caller sorts rows by
    prediction rank before grouping. Three columns are added to ``group``:

    - ``hospitalized_cumsum``: running count of transfers caught up to this
      row, e.g. transfers caught up to the 10th rank.
    - ``total_relevant``: total transfers in the group (per day per facility).
    - ``recall_at_k``: ``hospitalized_cumsum / total_relevant``.

    Returns the group with a fresh 0..n-1 index.
    """
    hospitalized = group["hospitalized_within_pred_range"]
    group.loc[:, "hospitalized_cumsum"] = hospitalized.cumsum()
    group.loc[:, "total_relevant"] = hospitalized.sum()
    group.loc[:, "recall_at_k"] = group["hospitalized_cumsum"] / group["total_relevant"]

    return group.reset_index(drop=True)


def performance_base_processing(idens, preds, target_3_day):
    """Rank predictions per facility-day and derive recall-at-rank columns.

    Args:
        idens: identifier frame; must contain ``censusdate`` and ``facilityid``.
        preds: prediction scores, assigned as the ``predictionvalue`` column.
        target_3_day: labels, assigned as ``hospitalized_within_pred_range``.

    Returns:
        ``(performance_base, facility_pats)`` where ``performance_base`` is the
        ranked frame with the columns added by ``precision_recall_at_k`` and
        ``facility_pats`` has, per facility, the median over census days of the
        highest rank of that day.
    """
    scored = idens.copy()
    scored['predictionvalue'] = preds
    scored['hospitalized_within_pred_range'] = target_3_day

    # Rank 1 = highest score within each (censusdate, facilityid) group.
    daily_groups = scored.groupby(['censusdate', 'facilityid'])
    scored['predictionrank'] = daily_groups['predictionvalue'].rank(ascending=False)
    ranked = scored.sort_values('predictionrank', ascending=True)

    per_group = ranked.groupby(["facilityid", "censusdate"]).apply(precision_recall_at_k)
    performance_base = per_group.reset_index(drop=True)

    # Max rank per facility per census day, then the median of those maxima per facility.
    # This is calculated in order to find the top 10%, 15% value for any given facility.
    max_rank_per_day = (
        performance_base.groupby(['censusdate', 'facilityid'])['predictionrank']
        .max()
        .reset_index()
    )
    facility_pats = (
        max_rank_per_day.groupby('facilityid')['predictionrank'].median().reset_index()
    )

    return performance_base, facility_pats

In [None]:
def get_auc(target_3_day, preds):
    """Return ``(ROC AUC, ROC AUC with max_fpr=0.25, average precision)`` for ``preds`` against ``target_3_day``."""
    return (
        roc_auc_score(target_3_day, preds),
        roc_auc_score(target_3_day, preds, max_fpr=0.25),
        average_precision_score(target_3_day, preds),
    )

def get_pline_recall(performance_base):
    """Recall, precision and F-beta score measured on the rows whose rank is exactly 15.

    Returns:
        ``(recall, precision, bscore)`` where recall is caught transfers over
        total relevant transfers, and precision is caught transfers over
        ``15 * number of rank-15 rows``.
    """
    rank_15_rows = performance_base[performance_base['predictionrank'] == 15]
    caught = rank_15_rows['hospitalized_cumsum'].sum()

    recall_at_15 = caught / rank_15_rows['total_relevant'].sum()
    # 15 patients are predicted per rank-15 row, so total predictions = 15 * row count.
    # NOTE(review): presumably one rank-15 row per facility and census day — confirm.
    precision_at_15 = caught / (len(rank_15_rows) * 15)

    return recall_at_15, precision_at_15, f_beta_score(precision_at_15, recall_at_15)
    
    
def _recall_within_rank(rth_rows, rank_cutoff):
    """Share of ``rth_rows`` whose ``min_predictionrank`` is <= ``rank_cutoff``; NaN when empty."""
    total = rth_rows.shape[0]
    if total == 0:
        # No RTH rows in this bucket: recall is undefined rather than an error.
        return float('nan')
    caught = rth_rows[rth_rows['min_predictionrank'] <= rank_cutoff].shape[0]
    return caught / total


def get_rth_recall(performance_base, rank_cutoff=15, lookback_days=3):
    """Recall of RTH events caught within ``rank_cutoff`` in the days leading up to them.

    If Total RTHs before processing & after processing varies
    it means there were 2 RTHs within the look-back span,
    i.e. an RTH occurred on 18th Nov and another happened on 21st Nov and we
    predicted it on 18th Nov. As part of the recall calculation we get credit
    for both these RTHs.

    Args:
        performance_base: frame with ``rth``, ``censusdate`` (datetime),
            ``facilityid``, ``masterpatientid``, ``predictionrank`` and ``LFS``
            columns. It is not modified.
        rank_cutoff: an RTH counts as caught when the patient's best rank in
            the look-back window is <= this value (default 15).
        lookback_days: number of days before the RTH date, inclusive, that are
            searched for the best rank (default 3).

    Returns:
        ``(recall, recall_LE30, recall_G30)``: overall recall, recall for rows
        with ``LFS <= 30`` and recall for rows with ``LFS > 30``. A value is
        NaN when its bucket contains no RTH rows.
    """
    # rename() returns a new frame, so the filtered slice is never mutated in place.
    rth_df = (
        performance_base.query('rth == 1')[['censusdate', 'facilityid', 'masterpatientid']]
        .rename(columns={'censusdate': 'rth_censusdate'}, errors='raise')
    )
    print('Total RTHs = ',rth_df.shape[0])

    # Pair every census row of a patient with each of that patient's RTH dates.
    df = performance_base.merge(
        rth_df, on=['facilityid', 'masterpatientid']
    )
    # Keep census rows on or before the RTH date and within the look-back window.
    df = df[df.censusdate <= df.rth_censusdate].copy()
    df['date_diff'] = (df['rth_censusdate'] - df['censusdate']).dt.days
    df = df[df.date_diff <= lookback_days].copy()
    # Best (lowest) rank the patient reached in the window before each RTH.
    df['min_predictionrank'] = df.groupby(
        ['rth_censusdate', 'facilityid', 'masterpatientid']
    )['predictionrank'].transform('min')
    df = df.query('rth == 1')
    print('Total RTHs after processing = ',df.shape[0])

    recall = _recall_within_rank(df, rank_cutoff)
    print('Total Recall = ',recall)

    recall_LE30 = _recall_within_rank(df.query('LFS <= 30'), rank_cutoff)
    recall_G30 = _recall_within_rank(df.query('LFS > 30'), rank_cutoff)

    return recall, recall_LE30, recall_G30
       
    
def f_beta_score(precision, recall, beta=2):
    """Return the F-beta score ``(1 + beta^2) * P * R / (beta^2 * P + R)``.

    Args:
        precision: precision value.
        recall: recall value.
        beta: weight of recall relative to precision (default 2).

    Returns:
        The F-beta score. For scalar inputs whose denominator is exactly zero
        (precision and recall both 0) the score is 0.0 instead of a division
        by zero. NaN inputs propagate to a NaN result.
    """
    beta_sq = beta ** 2
    denominator = beta_sq * precision + recall
    # Only guard scalars; array inputs keep their element-wise behaviour.
    if np.ndim(denominator) == 0 and denominator == 0:
        return 0.0
    return ((1 + beta_sq) * (precision * recall)) / denominator

In [None]:
def generate_auc_curve(actual_y, preds_y, aucroc, run_id):
    """Plot the ROC curve, save it as ``auc_curve_<run_id>.png`` and log it to MLflow.

    The curve is drawn on its own figure, which is closed afterwards, so
    repeated calls (e.g. one per hyperopt evaluation) do not pile earlier
    curves onto later images. Because the figure is closed it is not rendered
    inline in the notebook; the PNG artifact is the output.

    Args:
        actual_y: true labels.
        preds_y: predicted scores.
        aucroc: AUC value shown in the x-axis label.
        run_id: identifier used in the output file name.
    """
    filename = f'auc_curve_{run_id}.png'
    fpr, tpr, _ = metrics.roc_curve(actual_y, preds_y)

    fig, ax = plt.subplots()
    try:
        ax.plot(fpr, tpr)
        ax.set_xlabel("AUC="+str(aucroc))
        fig.savefig(filename)
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)

    log_artifact(filename)  # Export to MlFlow
    

def get_pos_neg(np_series):
    """Count positive (== 1) and negative (== 0) labels and their ratio.

    Args:
        np_series: array-like of labels supporting element-wise ``==``.

    Returns:
        ``(pos, neg, ratio)`` where ``ratio`` is ``neg / pos`` rounded to three
        decimals, or NaN when there are no positives.
    """
    neg = np.count_nonzero(np_series == 0)
    pos = np.count_nonzero(np_series == 1)
    if pos == 0:
        # No positives: the ratio is undefined; avoid a ZeroDivisionError.
        return pos, neg, float('nan')
    return pos, neg, round(neg / pos, 3)

In [None]:
def run_test_set(model, selected_modelid, run_id, test_start_date, test_end_date, x_df, target_3_day, idens):
    """Score an evaluation set with ``model`` and log parameters, metrics and the ROC plot to MLflow.

    Must be called inside an active MLflow run. Despite the name it is used for
    the validation split during tuning and for the test split otherwise.

    Args:
        model: trained model exposing ``predict``.
        selected_modelid: model id logged as ``19_MODEL_ID``.
        run_id: id used to name the ROC curve image.
        test_start_date: start of the evaluation window (logged).
        test_end_date: end of the evaluation window (logged).
        x_df: feature frame to predict on.
        target_3_day: labels for ``x_df``.
        idens: identifier frame aligned with ``x_df``.

    Returns:
        ``(total_aucroc, pline_recall_at_rank_15)``.
    """
    pos, neg, n2p_ratio = get_pos_neg(target_3_day)

    log_param('12_TEST_START_DATE', test_start_date)
    log_param('13_TEST_END_DATE', test_end_date)
    log_param('14_TEST_DURATION', get_date_diff(test_start_date, test_end_date))
    log_param('15_TEST_POS_COUNT', pos)
    log_param('16_TEST_NEG_COUNT', neg)
    log_param('17_TEST_N2P_RATIO', n2p_ratio)
    log_param('18_MODEL_DESCRIPTION', SELECTED_MODEL_VERSION)
    log_param('19_MODEL_ID', selected_modelid)

    # =========================== Predict on the evaluation dataset =======================
    preds = model.predict(x_df)
    print("=============================Prediction completed...=============================")

    # =========================== Overall AUC metrics ===========================
    total_aucroc, total_aucroc_25_fpr, total_ap = get_auc(target_3_day, preds)

    # The per-facility rank summary is not used here.
    performance_test_base, _facility_test_pats = performance_base_processing(idens, preds, target_3_day)

    pline_recall_at_rank_15, pline_precision_at_rank_15, pline_bscore = get_pline_recall(
        performance_test_base
    )
    recall, recall_LE30, recall_G30 = get_rth_recall(performance_test_base)

    log_metric('01_aucroc', total_aucroc)
    log_metric('02_RTH_recall_at_rank_15', recall)
    log_metric('03_RTH_recall_LOS_LE30_at_rank_15', recall_LE30)
    log_metric('04_RTH_recall_LOS_G30_at_rank_15', recall_G30)
    log_metric('05_Pline_recall_at_rank_15', pline_recall_at_rank_15)
    log_metric('06_Pline_precision_at_rank_15', pline_precision_at_rank_15)
    log_metric('07_Pline_bscore_at_rank_15', pline_bscore)
    # Each metric name now receives its own value (the two were previously swapped).
    log_metric('08_ap', total_ap)
    log_metric('09_aucroc_at_.25_fpr', total_aucroc_25_fpr)

    generate_auc_curve(target_3_day, preds, total_aucroc, run_id)

    return total_aucroc, pline_recall_at_rank_15

## =================== Model Training ===================

In [None]:
%%time

# Counter appended to the '00_DESCRIPTION' param; incremented on every objective call.
ITERATION_NUMBER = 1

def objective(params):
    """Train one LightGBM model inside a new MLflow run and return ``1 - AUC``.

    Used as the hyperopt objective (hyperopt minimizes the returned value).
    Per call it: logs run parameters, trains on ``train_data``, evaluates on
    the validation split when tuning or on the test split otherwise, appends a
    model config dict to ``base_models``, and logs the model, its config, the
    input feature list, the NA filler and the source code as MLflow artifacts.

    Args:
        params: LightGBM parameter dict sampled from ``lgb_param_space``.
            ``n_estimators`` is cast to ``int`` in place.

    Returns:
        ``1 - aucroc`` where ``aucroc`` is the AUC returned by ``run_test_set``.
    """
    global ITERATION_NUMBER
    print(f'Training LGB models with parameter: {params}')
    with mlflow.start_run():
        start_time = timeit.default_timer()
        run_uuid = mlflow.active_run().info.run_uuid

        # Convert these params to int since LightGBM expects integers
        # (hp.quniform yields floats).
        params['n_estimators'] = int(params.get('n_estimators'))
        #params['num_leaves'] = int(params.get('num_leaves'))
        #params['max_depth'] = int(params.get('max_depth'))
        #params['subsample_for_bin'] = int(params.get('subsample_for_bin'))
        #params['max_bin'] = int(params.get('max_bin'))
        
        pos, neg, n2p_ratio = get_pos_neg(train_target_3_day)
        facilities = get_facilities_from_train_data(train_idens)
        
        # Log the run description, training data summary and training date range
        log_param('00_DESCRIPTION', f'{DESCRIPTION_STRING} {ITERATION_NUMBER}')
        ITERATION_NUMBER += 1
        log_param('02_TRAINING_DATA',TRAINING_DATA)
        # NOTE(review): ', '.join requires string facility ids — confirm the dtype.
        log_param('03_FACILITIES', (', '.join(facilities)))
        log_param('04_FACILITIES_COUNT', len(facilities))
        log_param('05_TRAIN_START_DATE', EXPERIMENT_DATES['train_start_date'])
        log_param('06_TRAIN_END_DATE', EXPERIMENT_DATES['train_end_date'])
        log_param('07_TRAIN_DURATION', get_date_diff(EXPERIMENT_DATES['train_start_date'], EXPERIMENT_DATES['train_end_date']))
        log_param('08_TRAIN_POS_COUNT', pos)
        log_param('09_TRAIN_NEG_COUNT', neg)
        log_param('10_TRAIN_N2P_RATIO', n2p_ratio)
        log_param('11_TRAIN_FEATURE_COUNT', train_x.shape[1])

        log_param('001_HYPEROPT_VERSION',hyperopt.__version__)
        log_param('002_LGBM_VERSION',lgb.__version__)

        # Log every model hyper-parameter with an 'hp__' prefix
        for param in params:
            log_param(f'hp__{param}', params[param])

        set_tag('model', 'lgb')
        print("=============================Training started...=============================")
        # valid_sets is [valid_data] when tuning and None otherwise (see get_valid_data)
        model = lgb.train(params, train_set=train_data, valid_sets=get_valid_data())
        print("=============================Training completed...=============================")
        gc.collect()

        if HYPER_PARAMETER_TUNING:
            # Tuning: evaluate on the validation split and log the validation
            # best score / best iteration reported by the trained model
            aucroc, pline_recall_at_rank_15 = run_test_set(
                model,
                run_uuid,
                run_uuid,
                EXPERIMENT_DATES['validation_start_date'],
                EXPERIMENT_DATES['validation_end_date'],
                valid_x,
                valid_target_3_day,
                valid_idens
            )
            log_param('p__best_score', model.best_score['valid_0']['auc'])
            log_param('p__best_iteration', model.best_iteration)
        else:
            # Train+valid run: evaluate on the test split
            aucroc, pline_recall_at_rank_15 = run_test_set(
                model,
                run_uuid,
                run_uuid,
                EXPERIMENT_DATES['test_start_date'],
                EXPERIMENT_DATES['test_end_date'],
                test_x,
                test_target_3_day,
                test_idens
            )
        
        # Local dict (shadows the module-level `model_config`). Note that in tuning
        # mode 'test_auc' / 'test_recall_at_rank_15' hold validation-split values.
        model_config = {
            'modelid':run_uuid,
            'dayspredictionvalid':3,
            'model_algo': 'lgbm',
            'predictiontask': 'hospitalization',
            'modeldescription' : MODEL_DESCRIPTION,
            'client_data_trained_on' : TRAINING_DATA,
            'vector_model' : vector_model,
            'model_s3_folder': MLFLOW_EXPERIMENT_ID,
            'prospectivedatestart':f'{datetime.now().date()}',
            'training_start_date':EXPERIMENT_DATES['train_start_date'],
            'training_end_date':EXPERIMENT_DATES['train_end_date'],
            'validation_start_date': EXPERIMENT_DATES['validation_start_date'],
            'validation_end_date': EXPERIMENT_DATES['validation_end_date'],
            'test_start_date': EXPERIMENT_DATES['test_start_date'],
            'test_end_date': EXPERIMENT_DATES['test_end_date'],
            'test_auc': str(aucroc),
            'test_recall_at_rank_15': str(pline_recall_at_rank_15),
#             'facility_wise_auc': facility_auc_dict,
        }

        # Collected across runs in the module-level `base_models` list
        base_models.append(model_config)
        
        # ================= Save model related artifacts =========================
        # ./model_config.json is overwritten on every run; the logged artifact is this run's copy
        with open('./model_config.json', 'w') as outfile: json.dump(model_config, outfile)
        log_artifact(f'./model_config.json')
        
        # The model is pickled wrapped in a BaseModel, named after the run id
        base_model = BaseModel(model_name=run_uuid, model_type='lgb', model=model)
        with open(f'./{run_uuid}.pickle', 'wb') as f: pickle.dump(base_model, f)
        log_artifact(f'./{run_uuid}.pickle')
        
        # Feature names in training column order
        input_features = pd.DataFrame(train_x.columns, columns=['feature'])
        input_features.to_csv(f'./input_features.csv', index=False)
        log_artifact(f'./input_features.csv')
        
        with open('./na_filler.pickle','wb') as f: pickle.dump(na_filler, f, protocol=4)
        log_artifact('./na_filler.pickle')

        # =============== Log the code used for training as run artifacts ======================
        for notebook in list(Path('/src/notebooks').glob('0*.ipynb')):
            log_artifact(str(notebook))

        for shared_code in list(Path('/src/shared').glob('*.py')):
            log_artifact(str(shared_code))

        for client_code in list(Path('/src/clients').glob('*.py')):
            log_artifact(str(client_code))

        log_artifact('./parameterTunningConfig.py')
        
        # Wall-clock duration of this run, formatted as h:m:s
        t_sec = round(timeit.default_timer() - start_time)
        (t_min, t_sec) = divmod(t_sec,60)
        (t_hour,t_min) = divmod(t_min,60) 
        log_param('01_RUN_TIME', '{}h:{}m:{}s'.format(t_hour,t_min,t_sec))
        
        # hyperopt minimizes the objective, so return 1 - AUC
        return 1 - aucroc


# Run the search: 9 evaluations when tuning, a single evaluation otherwise
# (see get_training_runs). `best` holds the value returned by fmin.
# NOTE(review): no random state is passed, so the search is presumably not
# reproducible between runs — confirm and seed if needed.
best = fmin(fn=objective,
        space=lgb_param_space,
        algo=tpe.suggest,
        max_evals=get_training_runs())

In [None]:
# Overwrite ./model_config.json (previously the last run's dict) with the list of
# configs from every run collected in `base_models`.
with open('./model_config.json', 'w') as outfile: json.dump(base_models, outfile)

# base_models    

In [None]:
# Upload the training log in a separate MLflow run.
# NOTE(review): nothing in this notebook visibly creates ./lgbm_training.log —
# confirm it exists before running this cell.
with mlflow.start_run():
    log_param('00_DESCRIPTION', 'UPLOAD LOGS')
    log_artifact('./lgbm_training.log')

## ==================== Load Model from local folder =========================

In [None]:
# Run id of the model to inspect; its pickle is written by the training objective
# as '<run id>.pickle' in the working directory.
SELECTED_MODELID = 'ec308e6c819e4ba18f14f27559982070'

# Validate the id BEFORE trying to open the file, so a missing id produces this
# message instead of a confusing file error.
if not SELECTED_MODELID:
    raise Exception('Configure SELECTED_MODELID')

# model_path = f'./{modelid}/artifacts/{SELECTED_MODELID}.pickle'
model_path = f'{SELECTED_MODELID}.pickle'

# NOTE: unpickling can execute arbitrary code; only load pickles produced by this pipeline.
# The pickle holds a BaseModel wrapper; keep only the wrapped trained model.
with open(model_path, 'rb') as f:
    model = pickle.load(f).model

## ======================= Download Models from S3 =======================

In [None]:
# import subprocess

# modelid = 'fd27a573c801437488e4c2b432205834'

# subprocess.run(
#                 f'aws s3 sync s3://saiva-models/165/{modelid} {modelid}',
#                 shell=True,
#                 stderr=subprocess.DEVNULL,
#                 stdout=subprocess.DEVNULL,
#             )

# print(test_x.shape)
# all_feats = pd.read_csv(
#             f'./{modelid}/artifacts/input_features.csv'
#         )

# test_x = test_x.reindex(columns=all_feats.feature, fill_value=0)
# print(test_x.shape)

## =================== Run TestSet on any loaded Model above ===============

In [None]:
    
# with mlflow.start_run():
#     run_uuid = mlflow.active_run().info.run_uuid
#     log_param('00_DESCRIPTION', 'TEST RUN')
#     log_param('01_CLIENT', TRAINING_DATA)
#     log_param('02_FACILITIES', get_facilities_from_train_data(test_idens))
#     test_aucroc, test_recall_at_rank_15 = run_test_set(model, SELECTED_MODELID, run_uuid)

## =================== List Feature Importance of the model ====================

In [None]:
# Gain-based feature importance of the loaded model, highest first; show the top 10.
feature_imp = (
    pd.DataFrame({
        'feature': model.feature_name(),
        'importance': model.feature_importance(importance_type='gain'),  # alternative: 'split'
    })
    .sort_values('importance', ascending=False)
)
feature_imp.head(10)

In [None]:
# Plot the importance of the top 50 features using LightGBM's built-in helper.
lgb.plot_importance(model, max_num_features=50, figsize=(15,15))

## ================ Run Shap Explanations for Test Set ================

In [None]:
! pip install shap

In [None]:
import shap 

In [None]:
# SHAP values for the whole test set, followed by the summary plot.
# This can be slow on a large test set; the next cell works on the first 50 rows only.
shap_values = shap.TreeExplainer(model).shap_values(test_x)

shap.summary_plot(shap_values, test_x)

In [None]:
# SHAP takes a long time on the full test set, so explain only a slice of rows.
explainer = shap.TreeExplainer(model)
shap_sample = test_x.iloc[0:50]
shap_values = explainer.shap_values(shap_sample)

# Pick the attributions of the positive class.
# NOTE(review): the return shape depends on the installed shap version — a list of
# per-class arrays, a 3-D array with the class on the last axis, or a single 2-D
# array. Confirm which one applies here.
if isinstance(shap_values, list):
    positive_class_shap = shap_values[1]
elif np.ndim(shap_values) == 3:
    positive_class_shap = shap_values[:, :, 1]
else:
    positive_class_shap = shap_values

shap_results = []

# Iterate by POSITION: the rows of `positive_class_shap` are positional, so using
# index labels (as iterrows() yields) is only correct for a default 0..n-1 index.
for position in range(len(shap_sample)):
    iden_row = test_idens.iloc[position]
    shaps = pd.DataFrame(
        {
            "feature": test_x.columns,
            "attribution_score": positive_class_shap[position],
            "feature_value": shap_sample.iloc[position],
        }
    )

    shaps["masterpatientid"] = iden_row.masterpatientid
    shaps["facilityid"] = iden_row.facilityid
    shaps["censusdate"] = iden_row.censusdate

    shap_results.append(shaps)

# One row per (explained row, feature).
results = pd.concat(shap_results)

In [None]:
# Features that most often have an attribution score >= 0.1 in the explained rows (top 25).
results.query('attribution_score >= 0.1').sort_values(by=['attribution_score'], ascending=False)['feature'].value_counts().head(25)

