In [None]:
import gc
import sys
import timeit
from datetime import datetime
from pathlib import Path
import json

import lightgbm as lgb
import pandas as pd
from hyperopt import tpe, fmin
from sklearn.metrics import roc_auc_score, average_precision_score

import mlflow
from mlflow import log_metric, log_param, log_artifact, set_tag

sys.path.insert(0, '/src')
from dataclasses import dataclass
from typing import Any

## Load config

In [None]:
# Load the local training configuration. `training_config` is the section the
# later cells read from (experiment dates, per-organization configs, vector model).
from shared.constants import LOCAL_TRAINING_CONFIG_PATH
from shared.utils import load_config

config = load_config(LOCAL_TRAINING_CONFIG_PATH)
training_config = config.training_config

In [None]:
# Directory that holds the pickled train/valid/test artifacts read by the next cell.
# mkdir(..., exist_ok=True) only guarantees the directory exists; it does not
# create the pickles, so the loads below still fail if an earlier step never ran.
processed_path = Path('/data/processed')
processed_path.mkdir(parents=True, exist_ok=True)

In [None]:
import pickle


def _load_processed_pickle(file_name):
    """Deserialize a single artifact stored under ``processed_path``.

    NOTE: ``pickle.load`` can execute arbitrary code embedded in the file, so
    only point this at artifacts produced by this project's own pipeline.
    """
    with open(processed_path / file_name, 'rb') as f:
        return pickle.load(f)


# Training split: features, 3-/7-day targets and identifier columns.
train_x = _load_processed_pickle('final-train_x.pickle')
train_target_3_day = _load_processed_pickle('final-train_target_3_day.pickle')
train_target_7_day = _load_processed_pickle('final-train_target_7_day.pickle')
train_idens = _load_processed_pickle('final-train_idens.pickle')

# Validation split.
valid_x = _load_processed_pickle('final-valid_x.pickle')
valid_target_3_day = _load_processed_pickle('final-valid_target_3_day.pickle')
valid_target_7_day = _load_processed_pickle('final-valid_target_7_day.pickle')
valid_idens = _load_processed_pickle('final-valid_idens.pickle')

# Test split.
test_x = _load_processed_pickle('final-test_x.pickle')
test_target_3_day = _load_processed_pickle('final-test_target_3_day.pickle')
test_target_7_day = _load_processed_pickle('final-test_target_7_day.pickle')
test_idens = _load_processed_pickle('final-test_idens.pickle')

# NA fill values; re-saved as a run artifact by the training objective.
na_filler = _load_processed_pickle('final-na_filler.pickle')

In [None]:
# Sanity check: print the number of NA-filler entries next to the number of
# feature columns so a mismatch is visible before training.
# NOTE(review): presumably these two counts are expected to match — confirm.
print('na_fillers column count: ', len(na_filler.keys()))
print('feature column count: ', len(train_x.columns))

In [None]:
# Preview the first rows of the training features as the cell output.
train_x.head()

In [None]:
# LightGBM datasets for training and validation, both labelled with the 3-day
# target. The 7-day targets loaded earlier are not used by the cells shown here.
train_data = lgb.Dataset(train_x, label=train_target_3_day)
valid_data = lgb.Dataset(valid_x, label=valid_target_3_day)


## ============= Set the Experiment config correctly =============

In [None]:
# Date ranges and embedding model recorded with every run.
EXPERIMENT_DATES = training_config.training_metadata.experiment_dates
vector_model = training_config.training_metadata.vector_model

# Organization ids joined with '+', one per configured org.
CLIENT = "+".join(
    org_config.organization_id
    for org_config in training_config.ml_model_org_configs
)

# Collects one model_config dict per hyperopt trial.
base_models = []

MODEL_DESCRIPTION = f'{CLIENT}-3-day-hosp-v3'   # Name used to filter models in AWS quicksight
TRAINING_DATA = CLIENT                          # trained on which data? e.g. avante + champion
experiment_name = f'{CLIENT}-3-day-hosp-v3'     # ML Flow experiment name

@dataclass
class BaseModel:
    """Container that is pickled and logged as the model artifact of a run."""
    # Identifier of the model; the training objective passes the MLflow run id.
    model_name: str
    # Short algorithm tag; the training objective passes 'lgb'.
    model_type: str
    # The trained model object itself (the object returned by lgb.train).
    model: Any

# Display the configured date ranges as the cell output for a quick visual check.
EXPERIMENT_DATES

In [None]:
# Point MLflow at the tracking server and select (or create) the experiment.
# NOTE(review): the tracking URI is hard-coded to the dev server — confirm this
# is intended before running against another environment.
mlflow.set_tracking_uri('http://mlflow.saiva-dev')

# Experiment name which appears in the MLflow UI
mlflow.set_experiment(experiment_name)

# Look the experiment up again to obtain its id; the id is later stored in each
# model_config as 'model_s3_folder'.
EXPERIMENT = mlflow.get_experiment_by_name(experiment_name)
MLFLOW_EXPERIMENT_ID = EXPERIMENT.experiment_id

print(f'Experiment ID: {MLFLOW_EXPERIMENT_ID}')

In [None]:
""" Calculate how many transfers were caught up to a particular rank.
hospitalized_cumsum - how many transfers were caught up to a certain rank, e.g. transfers caught through the 10th rank
total_relevant - total transfers per day per facility
"""

def precision_recall_at_k(group):
    """Add cumulative-hit and recall columns to one (facility, census date) group.

    The rows are expected to already be ordered by prediction rank, because the
    cumulative sum is taken in row order.

    Args:
        group: DataFrame with a ``hospitalized_within_pred_range`` column
            (1/True where the resident was hospitalized, else 0/False).

    Returns:
        A new DataFrame with a fresh 0..n-1 index and three added columns:
        ``hospitalized_cumsum`` (hits caught up to and including each row),
        ``total_relevant`` (total hits in the group) and ``recall_at_k``
        (their ratio; NaN for every row when the group has no hits).

    The input frame is left untouched: pandas does not support functions passed
    to ``groupby.apply`` mutating the group they receive.
    """
    hits = group.hospitalized_within_pred_range
    hospitalized_cumsum = hits.cumsum()
    total_relevant = hits.sum()

    enriched = group.assign(
        hospitalized_cumsum=hospitalized_cumsum,
        total_relevant=total_relevant,
        # 0 / 0 yields NaN here; callers rely on NaN to skip groups without hits.
        recall_at_k=hospitalized_cumsum / total_relevant,
    )
    return enriched.reset_index(drop=True)

In [None]:
%%writefile parameterTunningConfig.py
# This cell just creates a python file containing the contents of this cell

from hyperopt import hp

# Parameter tuning: hyperopt search space for the LightGBM model.
# Entries built with hp.* are sampled per trial; plain values are passed through
# unchanged. Commented-out entries are parameters that are currently not tuned.
# NOTE(review): 'application' and 'objective' carry the same value — presumably
# one is an alias of the other in LightGBM; confirm and drop the duplicate.
lgb_param_space = {
 'application': 'binary',
 'objective': 'binary',
 'metric': 'auc',
 #'boosting_type': hp.choice('boosting_type', ['gbdt']),
 #'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
 'learning_rate': hp.uniform('learning_rate', 0.001, 0.05),
 #'max_depth': hp.quniform('max_depth', -1, 10, 1),
 #'min_child_samples': 20,
 #'min_child_weight': 0.001,
 #'min_split_gain': 0.0,
 # Sampled value is cast to int by the training objective before use.
 'n_estimators': hp.quniform('n_estimators',108,405,10),
 'n_jobs': -1,
#  'num_leaves': hp.quniform('num_leaves', 30, 300, 1),
 #'subsample': hp.uniform('subsample', 0, 1),
 #'subsample_for_bin': hp.quniform('subsample_for_bin', 200000, 500000, 1000),
 'verbose': 3,
 'is_unbalance': True,
 #'max_bin': hp.quniform('max_bin', 100,1000, 100),
 'early_stopping_round': None,
}

In [None]:
%run parameterTunningConfig.py

# Execute the python file stored earlier

In [None]:
def log_overall_model_auc(target_3_day, preds, dataset_type='valid'):
    """Log dataset-level ranking metrics to the active MLflow run.

    Three metrics are logged, each prefixed with ``total_{dataset_type}_``:
    ``aucroc`` (ROC AUC), ``aucroc_at_.25_fpr`` (ROC AUC restricted to
    FPR <= 0.25) and ``ap`` (average precision).

    Args:
        target_3_day: True binary labels.
        preds: Predicted scores, aligned with ``target_3_day``.
        dataset_type: Label inserted into the metric names, e.g. 'valid' or 'test'.

    Returns:
        The plain ROC AUC.
    """
    total_aucroc = roc_auc_score(target_3_day, preds)
    total_aucroc_25_fpr = roc_auc_score(target_3_day, preds, max_fpr=0.25)
    total_ap = average_precision_score(target_3_day, preds)

    log_metric(f'total_{dataset_type}_aucroc', total_aucroc)
    # Each value is logged under its own name; previously the partial AUC and
    # the average precision were stored under each other's metric name.
    log_metric(f'total_{dataset_type}_aucroc_at_.25_fpr', total_aucroc_25_fpr)
    log_metric(f'total_{dataset_type}_ap', total_ap)

    return total_aucroc

def performance_base_processing(idens, preds, target_3_day):
    """Rank predictions per facility and census date and derive recall-at-rank data.

    Args:
        idens: Identifier frame containing at least ``censusdate`` and ``facilityid``.
        preds: Predicted scores, one per row of ``idens``.
        target_3_day: Outcome labels, one per row of ``idens``.

    Returns:
        Tuple ``(performance_base, facility_pats)``:
        ``performance_base`` holds every row with its prediction rank plus the
        columns added by ``precision_recall_at_k``; ``facility_pats`` holds, per
        facility, the median over census dates of the largest daily rank.
    """
    ranked = idens.copy()
    ranked['predictionvalue'] = preds
    ranked['hospitalized_within_pred_range'] = target_3_day

    # Rank 1 is the highest score within each (census date, facility) group.
    daily_scores = ranked.groupby(['censusdate', 'facilityid'])['predictionvalue']
    ranked['predictionrank'] = daily_scores.rank(ascending=False)
    ranked = ranked.sort_values('predictionrank', ascending=True)

    # Rows are rank-ordered inside every group, so the cumulative sums taken by
    # precision_recall_at_k run from the best rank downwards.
    per_day_groups = ranked.groupby(["facilityid", "censusdate"])
    performance_base = per_day_groups.apply(precision_recall_at_k).reset_index(drop=True)

    # Largest rank seen on each day, then the median of those per facility.
    daily_max_rank = (
        performance_base
        .groupby(['censusdate', 'facilityid'])['predictionrank']
        .max()
        .reset_index()
    )
    facility_pats = (
        daily_max_rank
        .groupby('facilityid')['predictionrank']
        .median()
        .reset_index()
    )

    return performance_base, facility_pats


In [None]:
def log_facility_wise_test_metrics(test_idens, facility_pats, performance_base):
    """Log the recall at rank 15, averaged over facilities, for the test set.

    For every facility the ``recall_at_k`` values of the rows ranked exactly 15
    are averaged over census dates (NaN values are ignored). The facility means
    are then averaged over all facilities that had at least one usable value.

    Args:
        test_idens: Identifier frame with a ``facilityid`` column.
        facility_pats: One row per facility; only its row count is used.
        performance_base: Ranked rows with ``facilityid``, ``predictionrank``
            and ``recall_at_k`` columns.

    Returns:
        The averaged recall, or ``None`` when no facility has a usable rank-15
        recall. In that case nothing is logged; previously this raised a
        ZeroDivisionError.
    """
    total_facility_recall = 0
    excluded_facility_count = 0
    for facilityid in sorted(test_idens.facilityid.unique()):
        rank_subset = performance_base.loc[performance_base.facilityid == facilityid]
        rank_15_recall = rank_subset.loc[rank_subset.predictionrank == 15].recall_at_k
        # Facilities can have 0 transfers for the entire test set, which leaves
        # only NaN recalls; count() ignores NaN, so those facilities are excluded.
        if rank_15_recall.count() > 0:
            total_facility_recall += rank_15_recall.sum() / rank_15_recall.count()
        else:
            excluded_facility_count += 1

    # Average only over the facilities that contributed a recall value.
    included_facility_count = facility_pats.shape[0] - excluded_facility_count
    if included_facility_count <= 0:
        print('No facility has a recall value at rank 15; total_test_recall_at_rank_15 not logged')
        return None

    recall = total_facility_recall / included_facility_count
    log_metric('total_test_recall_at_rank_15', recall)
    return recall

In [None]:
def _agg_recall_and_precision_at_rank(rank_subset, k):
    """Return (recall, precision) at rank ``k`` aggregated over a facility's census dates."""
    rows_at_k = rank_subset.loc[rank_subset.predictionrank == k]
    caught_transfers = rows_at_k.hospitalized_cumsum.sum()
    total_transfers = rows_at_k.total_relevant.sum()
    recall = caught_transfers / total_transfers
    # k residents are flagged on every census date, so the number of predictions
    # behind `caught_transfers` is k times the number of dates, not k alone.
    precision = caught_transfers / (k * len(rows_at_k))
    return recall, precision


def log_facility_wise_valid_metrics(valid_idens, facility_pats, performance_base, valid_preds, params,
                                    valid_target=None):
    """Log per-facility validation metrics and the facility-averaged recall at rank 15.

    For every facility this logs ROC AUC, ROC AUC at FPR <= 0.25 and average
    precision, plus aggregated recall and F-beta at the ranks corresponding to
    10%, 15% and 20% of the facility's median daily census. It also logs
    ``total_valid_recall_at_rank_15``, the recall at rank 15 averaged over
    facilities.

    Args:
        valid_idens: Identifier frame with a ``facilityid`` column, row-aligned
            with ``valid_preds``.
        facility_pats: One row per facility with its median daily maximum rank
            in ``predictionrank``.
        performance_base: Ranked rows with ``facilityid``, ``predictionrank``,
            ``hospitalized_cumsum``, ``total_relevant`` and ``recall_at_k``.
        valid_preds: Predicted scores for the validation rows.
        params: Trial parameters; ``facility_to_optimize_for`` optionally names
            the facility whose 15% recall is returned for optimization.
        valid_target: Validation labels row-aligned with ``valid_idens``.
            Defaults to the notebook-level ``valid_target_3_day``.

    Returns:
        Tuple ``(agg_recall_to_optimize, facility_auc_dict)``: the 15% aggregated
        recall of the facility to optimize for (``None`` if not requested or not
        computable) and a dict mapping numeric facility id to ROC AUC.

    Changes from the earlier revision:
        * The rank-15 recall is gathered for every facility before the
          exception-guarded section. Facilities skipped by the guard used to
          stay in the denominator without contributing a recall.
        * Aggregated precision divides by ``k * number_of_dates``.
        * The final average is not logged (instead of raising) when no facility
          has a usable rank-15 recall.
    """
    if valid_target is None:
        valid_target = valid_target_3_day

    # (label used in the metric name, fraction of the median daily census)
    percent_cutoffs = ((10, .1), (15, .15), (20, .2))

    facility_auc_dict = {}
    total_facility_recall = 0
    excluded_facility_count = 0
    agg_recall_to_optimize = None

    for facilityid in sorted(valid_idens.facilityid.unique()):
        mask = (valid_idens.facilityid == facilityid).values
        median_max_rank = facility_pats.loc[facility_pats.facilityid == facilityid].predictionrank
        k_by_percent = {
            percent: round(median_max_rank * fraction).values[0]
            for percent, fraction in percent_cutoffs
        }
        rank_subset = performance_base.loc[performance_base.facilityid == facilityid]

        # Facilities can have 0 transfers for the entire validation set, which
        # leaves only NaN recalls; count() ignores NaN, so they are excluded.
        rank_15_recall = rank_subset.loc[rank_subset.predictionrank == 15].recall_at_k
        if rank_15_recall.count() > 0:
            total_facility_recall += rank_15_recall.sum() / rank_15_recall.count()
        else:
            excluded_facility_count += 1

        try:
            facility_target = valid_target[mask]
            facility_preds = valid_preds[mask]
            roc_auc = roc_auc_score(facility_target, facility_preds)

            # Add facility wise AUC only if the facility is part of training client.
            # NOTE(review): CLIENT joins several organization ids with '+', so this
            # substring test presumably never matches for multi-org runs — confirm.
            if CLIENT in facilityid:
                fid = int(facilityid.split('_')[1])
                facility_auc_dict[fid] = roc_auc

            log_metric(f'facility_{facilityid}_valid_aucroc', roc_auc)
            log_metric(f'facility_{facilityid}_valid_aucroc_at_.25_fpr',
                       roc_auc_score(facility_target, facility_preds, max_fpr=0.25))
            log_metric(f'facility_{facilityid}_valid_ap',
                       average_precision_score(facility_target, facility_preds))

            # We know the total transfers and the transfers caught up to a rank.
            # Recall is the sum of caught transfers over the date range divided
            # by the sum of total transfers per day over the same range.
            agg_scores = {
                percent: _agg_recall_and_precision_at_rank(rank_subset, k)
                for percent, k in k_by_percent.items()
            }
            for percent, (agg_recall, agg_precision) in agg_scores.items():
                log_metric(f'facility_{facilityid}_agg_recall_at_{percent}_percent', agg_recall)
                log_metric(f'facility_{facilityid}_agg_b-score_at_{percent}_percent',
                           f_beta_score(agg_precision, agg_recall))

            if params.get('facility_to_optimize_for') == facilityid:
                agg_recall_to_optimize = agg_scores[15][0]

        except Exception as e:
            # Workaround for infinity-benchmark because you cannot calculate facility
            # level metrics for one facility. The facility's remaining metrics are
            # skipped: the exception is printed and the loop continues.
            print(f'Skipping facility-level metrics for {facilityid}: {e}')
            continue

    # Average the facility recalls over the facilities that contributed a value.
    included_facility_count = facility_pats.shape[0] - excluded_facility_count
    if included_facility_count > 0:
        log_metric('total_valid_recall_at_rank_15', total_facility_recall / included_facility_count)
    else:
        print('No facility has a recall value at rank 15; total_valid_recall_at_rank_15 not logged')

    return agg_recall_to_optimize, facility_auc_dict

In [None]:
def f_beta_score(precision, recall, beta=2):
    """Return the F-beta score for a precision/recall pair.

    Args:
        precision: Precision value.
        recall: Recall value.
        beta: Weight of recall relative to precision (default 2, favouring recall).

    Returns:
        ``(1 + beta^2) * P * R / (beta^2 * P + R)``, or ``0.0`` when the
        denominator is zero (precision and recall both 0), where the formula is
        undefined and would otherwise raise ZeroDivisionError. NaN inputs
        propagate to a NaN result.
    """
    beta_squared = beta ** 2
    denominator = beta_squared * precision + recall
    if denominator == 0:
        return 0.0
    return ((1 + beta_squared) * (precision * recall)) / denominator

In [None]:
%%time

def objective(params):
    """Hyperopt objective: train one LightGBM model and log it as an MLflow run.

    Each call opens a new MLflow run, logs the date ranges and the sampled
    parameters, trains on ``train_data``, scores the validation and test sets,
    logs overall and facility-level metrics, and uploads the model, its config,
    the feature list, the NA filler and the training code as artifacts. The
    run's ``model_config`` is also appended to the module-level ``base_models``.

    Args:
        params: Parameter dict sampled from ``lgb_param_space``. It is modified
            in place (``n_estimators`` is cast to int).

    Returns:
        The loss hyperopt minimizes: ``1 - agg_recall_to_optimize`` when the
        facility-level step produced a recall to optimize for, otherwise
        ``1 - total_valid_aucroc``.
    """
    print(f'Training LGB models with parameter: {params}')
    with mlflow.start_run():
        run_uuid = mlflow.active_run().info.run_uuid

        # hp.quniform samples floats; cast the integer-valued parameters before use.
        #params['num_leaves'] = int(params.get('num_leaves'))
        params['n_estimators'] = int(params.get('n_estimators'))
        #params['max_depth'] = int(params.get('max_depth'))
        #params['subsample_for_bin'] = int(params.get('subsample_for_bin'))
        #params['max_bin'] = int(params.get('max_bin'))

        # Log the train, validation & test date ranges
        log_param('TRAIN_START_DATE', EXPERIMENT_DATES['train_start_date'])
        log_param('TRAIN_END_DATE', EXPERIMENT_DATES['train_end_date'])
        log_param('VALIDATION_START_DATE', EXPERIMENT_DATES['validation_start_date'])
        log_param('VALIDATION_END_DATE', EXPERIMENT_DATES['validation_end_date'])
        log_param('TEST_START_DATE', EXPERIMENT_DATES['test_start_date'])
        log_param('TEST_END_DATE', EXPERIMENT_DATES['test_end_date'])

        # Log every model parameter of this trial (after the int cast above).
        for param in params:
            log_param(param, params[param])

        set_tag('model', 'lgb')
        print("=============================Training started...=============================")
        # NOTE(review): the number of boosting rounds is only supplied through
        # params['n_estimators'] — presumably LightGBM treats it as the round count; confirm.
        model = lgb.train(params, train_set=train_data, valid_sets=[valid_data])

        print("=============================Training completed...=============================")
        gc.collect()

        # ===========================Predict on validation dataset=======================
        valid_preds = model.predict(valid_x)
        # ===========================Predict on test dataset=======================
        test_preds = model.predict(test_x)
        print("=============================Prediction completed...=============================")

        # =========================== TOTAL AUCROC on VALIDATION SET ===========================
        total_valid_aucroc = log_overall_model_auc(valid_target_3_day, valid_preds, dataset_type='valid')
        # =========================== TOTAL AUCROC on TEST SET ===========================
        total_test_aucroc = log_overall_model_auc(test_target_3_day, test_preds, dataset_type='test')

        # Rank the predictions per facility/day for both evaluation sets.
        performance_valid_base, facility_valid_pats = performance_base_processing(valid_idens, valid_preds, valid_target_3_day)
        performance_test_base, facility_test_pats = performance_base_processing(test_idens, test_preds, test_target_3_day)

        agg_recall_to_optimize, facility_auc_dict = log_facility_wise_valid_metrics(
            valid_idens,
            facility_valid_pats,
            performance_valid_base,
            valid_preds,
            params
        )
        total_test_recall = log_facility_wise_test_metrics(
            test_idens,
            facility_test_pats,
            performance_test_base
        )

        # Metadata describing this run's model; logged as a JSON artifact below.
        model_config = {
            'modelid':run_uuid,
            'dayspredictionvalid':3,
            'model_algo': 'lgbm',
            'predictiontask': 'hospitalization',
            'modeldescription' : MODEL_DESCRIPTION,
            'client_data_trained_on' : TRAINING_DATA,
            'vector_model' : vector_model,
            'model_s3_folder': MLFLOW_EXPERIMENT_ID,
            'prospectivedatestart':f'{datetime.now().date()}',
            'training_start_date':EXPERIMENT_DATES['train_start_date'],
            'training_end_date':EXPERIMENT_DATES['train_end_date'],
            'validation_start_date': EXPERIMENT_DATES['validation_start_date'],
            'validation_end_date': EXPERIMENT_DATES['validation_end_date'],
            'test_start_date': EXPERIMENT_DATES['test_start_date'],
            'test_end_date': EXPERIMENT_DATES['test_end_date'],
            'test_auc': total_test_aucroc,
            'test_recall_at_rank_15': total_test_recall,
            'facility_wise_auc': facility_auc_dict,
        }

        # Keep the config of every trial so they can be reviewed after the search.
        base_models.append(model_config)

        # ================= Save model related artifacts =========================
        # Files are written to the working directory first, then uploaded.
        # ./model_config.json is overwritten on every trial.
        with open('./model_config.json', 'w') as outfile: json.dump(model_config, outfile)
        log_artifact(f'./model_config.json')

        base_model = BaseModel(model_name=run_uuid, model_type='lgb', model=model)
        with open(f'./{run_uuid}.pickle', 'wb') as f: pickle.dump(base_model, f)
        log_artifact(f'./{run_uuid}.pickle')

        # Feature names in training order.
        input_features = pd.DataFrame(train_x.columns, columns=['feature'])
        input_features.to_csv(f'./input_features.csv', index=False)
        log_artifact(f'./input_features.csv')

        with open('./na_filler.pickle','wb') as f: pickle.dump(na_filler, f, protocol=4)
        log_artifact('./na_filler.pickle')

        # =============== Save the code used for training as run artifacts ======================
        for notebook in list(Path('/src/notebooks').glob('0*.ipynb')):
            log_artifact(str(notebook))

        for shared_code in list(Path('/src/shared').glob('*.py')):
            log_artifact(str(shared_code))

        for client_code in list(Path('/src/clients').glob('*.py')):
            log_artifact(str(client_code))

        log_artifact('./parameterTunningConfig.py')

        # hyperopt minimizes the returned value, so metrics are turned into losses.
        if agg_recall_to_optimize is not None:
            return 1 - agg_recall_to_optimize
        else:
            return 1 - total_valid_aucroc

# Run the hyperparameter search and report the wall-clock time it took.
# `best` receives the value returned by fmin for the search over lgb_param_space.
# NOTE(review): no random state is passed to fmin, so repeated runs presumably
# sample different parameters — confirm whether reproducibility is required.
start_time = timeit.default_timer()
best = fmin(fn=objective,
        space=lgb_param_space,
        algo=tpe.suggest,
        max_evals=9)
print(f"==============Time taken for training {timeit.default_timer() - start_time}======================")

In [None]:
# Replace the per-trial ./model_config.json with the list of configs collected
# from all trials. This file is only written locally; it is not logged to MLflow here.
with open('./model_config.json', 'w') as outfile: json.dump(base_models, outfile)

# Display the collected configs as the cell output.
base_models