In [None]:
import pandas as pd
from pathlib import Path
import timeit
import gc

from sklearn.metrics import roc_auc_score, average_precision_score
import mlflow
from mlflow import log_metric, log_param, log_artifact, set_tag
import lightgbm as lgb

from hyperopt import tpe, fmin
from dataclasses import dataclass
from typing import Any

In [None]:
# Directory that holds the pickled train/valid/test splits read by the next cell.
# mkdir is a no-op when the directory already exists.
processed_path = Path('/data') / 'processed'
processed_path.mkdir(exist_ok=True, parents=True)

In [None]:
import pickle


def _load_processed(name):
    """Unpickle ``final-<name>.pickle`` from ``processed_path`` and return the object.

    NOTE(security): ``pickle.load`` can execute arbitrary code while loading.
    Only use this on files written by the project's own preprocessing step.
    """
    with open(processed_path / f'final-{name}.pickle', 'rb') as f:
        return pickle.load(f)


# Training split: features, 3-day / 7-day targets and identifier columns
train_x = _load_processed('train_x')
train_target_3_day = _load_processed('train_target_3_day')
train_target_7_day = _load_processed('train_target_7_day')
train_idens = _load_processed('train_idens')

# Validation split
valid_x = _load_processed('valid_x')
valid_target_3_day = _load_processed('valid_target_3_day')
valid_target_7_day = _load_processed('valid_target_7_day')
valid_idens = _load_processed('valid_idens')

# Test split
test_x = _load_processed('test_x')
test_target_3_day = _load_processed('test_target_3_day')
test_target_7_day = _load_processed('test_target_7_day')
test_idens = _load_processed('test_idens')

# Object pickled alongside the splits; it is re-saved as a model artifact during tuning
na_filler = _load_processed('na_filler')

In [None]:
# Display the identifier data loaded for the training split
# (presumably the same layout as valid_idens, which carries censusdate and facilityid — confirm)
train_idens

In [None]:
# Peek at the first rows of the training feature frame that is fed to LightGBM
train_x.head()

In [None]:
# Wrap the feature frames and their 3-day targets in LightGBM datasets;
# the tuning objective trains on train_data and evaluates on valid_data.
train_data = lgb.Dataset(data=train_x, label=train_target_3_day)
valid_data = lgb.Dataset(data=valid_x, label=valid_target_3_day)


## ============= Set the Experiment name correctly ============= 

In [None]:
# Point MLflow at the tracking server and select the experiment under which
# every tuning run of this notebook is recorded.
MLFLOW_TRACKING_URI = 'http://mlflow.saiva-dev'
# Experiment name as it appears in the MLflow UI
MLFLOW_EXPERIMENT_NAME = 'trio_v2_only_notes_change'

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

In [None]:
def precision_recall_at_k(group):
    """Add cumulative-hit and recall-at-k columns to one group of ranked rows.

    The rows are processed in the order they arrive, so the caller must sort
    them by rank beforehand. Despite the name, only recall is computed here.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows with a ``hospitalized_within_pred_range`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index and three added columns:
        ``hospitalized_cumsum`` (running sum of the target),
        ``total_relevant`` (sum of the target over the whole group) and
        ``recall_at_k`` (their ratio; 0/0 gives NaN when the group has no
        positive row). The input frame is left unmodified.
    """
    hits = group.hospitalized_within_pred_range
    # assign() returns a new frame, so the slice handed in by groupby.apply is
    # never written to (the previous .loc assignment mutated it in place).
    return group.assign(
        hospitalized_cumsum=hits.cumsum(),
        total_relevant=hits.sum(),
        recall_at_k=lambda g: g.hospitalized_cumsum / g.total_relevant,
    ).reset_index(drop=True)

In [None]:
%%writefile parameterTunningConfig.py
# The %%writefile magic above does not execute this cell: it saves the cell body to
# parameterTunningConfig.py. The file is executed by the following %run cell and is
# also logged as an MLflow artifact by the tuning objective.

from hyperopt import hp

# Hyperparameter search space handed to hyperopt's fmin.
# Plain values are passed through unchanged on every trial; hp.* entries are sampled per trial.
# Commented-out entries are parameters that are currently not searched.
lgb_param_space = {
 # NOTE(review): LightGBM documents `application` as an alias of `objective`;
 # setting both looks redundant — confirm before removing either.
 'application': 'binary',
 'objective': 'binary',
 'metric': 'auc',
 #'boosting_type': hp.choice('boosting_type', ['gbdt']),
 #'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
 'learning_rate': hp.uniform('learning_rate', 0.001, 0.05),
 #'max_depth': hp.quniform('max_depth', -1, 10, 1),
 #'min_child_samples': 20,
 #'min_child_weight': 0.001,
 #'min_split_gain': 0.0,
 # Sampled in steps of 10 between 100 and 400; the objective casts the value to int
 'n_estimators': hp.quniform('n_estimators',100,400,10),
 'n_jobs': -1,
#  'num_leaves': hp.quniform('num_leaves', 30, 300, 1),
 #'subsample': hp.uniform('subsample', 0, 1),
 #'subsample_for_bin': hp.quniform('subsample_for_bin', 200000, 500000, 1000),
 'verbose': 3,
 'is_unbalance': hp.choice('is_unbalance', [True, False]),
 #'max_bin': hp.quniform('max_bin', 100,1000, 100),
 'early_stopping_round': None,
}

In [None]:
# Execute the file written by the %%writefile cell so that lgb_param_space
# is defined in this notebook's namespace
%run parameterTunningConfig.py

In [None]:
@dataclass
class BaseModel:
    """Container that pairs a trained model with the labels used to track it.

    Instances are pickled and logged as MLflow artifacts by the tuning objective.
    """
    # Identifier of the model; the tuning objective stores the MLflow run id here
    model_name: str
    # Short tag for the model family, e.g. 'lgb'
    model_type: str
    # The trained model object itself (whatever lgb.train returned, for 'lgb' models)
    model: Any
    

In [None]:
# Accumulates one BaseModel per hyperopt trial (appended inside objective);
# re-running this cell discards the models collected so far.
base_models = []

In [None]:
def f_beta_score(precision, recall, beta=2):
    """Return the F-beta score of a scalar precision/recall pair.

    Computes ``(1 + beta**2) * precision * recall / (beta**2 * precision + recall)``.
    With ``beta > 1`` recall is weighted more heavily than precision.

    Parameters
    ----------
    precision, recall : float
        Scalar precision and recall values.
    beta : float, default 2
        Relative weight of recall versus precision.

    Returns
    -------
    float
        The F-beta score. When the denominator is zero (precision and recall
        are both 0) the score is defined as 0.0 instead of raising
        ZeroDivisionError or producing NaN. NaN inputs still propagate as NaN.
    """
    beta_sq = beta ** 2
    denominator = beta_sq * precision + recall
    if denominator == 0:
        # No true positives at all: by convention the score is zero
        return 0.0
    return (1 + beta_sq) * (precision * recall) / denominator

In [None]:
%%time

def _agg_recall_precision_at_k(rank_subset, k):
    """Aggregate recall and precision at rank ``k`` over one facility's census dates.

    ``rank_subset`` holds the facility's rows of ``performance_base``. The rows
    whose ``predictionrank`` equals ``k`` carry, per census date, the number of
    positives found within the top ``k`` (``hospitalized_cumsum``) and the
    number of positives on that date (``total_relevant``).

    Returns ``(recall, precision)``. Neither has a meaningful value (0/0) when
    no census date has a row at rank ``k``.
    """
    at_k = rank_subset.loc[rank_subset.predictionrank == k]
    hits = at_k.hospitalized_cumsum.sum()
    recall = hits / at_k.total_relevant.sum()
    # k rows are selected on every census date, so the number of selected rows
    # is k * (number of dates). Dividing by k alone let precision exceed 1.
    precision = hits / (k * len(at_k))
    return recall, precision


def objective(params):
    """Hyperopt objective: train one LightGBM model and log it to MLflow.

    Inside a new MLflow run this trains on ``train_data``, predicts on
    ``valid_x``, logs overall and per-facility validation metrics, stores the
    model in ``base_models`` and uploads the model, feature list, ``na_filler``
    and the training code as artifacts.

    Parameters
    ----------
    params : dict
        One sample drawn from ``lgb_param_space``. The dict is copied, so the
        caller's object is not modified.

    Returns
    -------
    float
        Loss to minimise. This is ``1 - total_valid_aucroc`` unless
        ``params['facility_to_optimize_for']`` names a facility whose metrics
        could be computed, in which case it is ``1 -`` that facility's
        aggregated recall at 15 percent.
    """
    print(f'Training LGB models with parameter: {params}')
    # Copy so the integer cast below does not modify the dict owned by the caller
    params = dict(params)
    with mlflow.start_run():
        run_uuid = mlflow.active_run().info.run_uuid

        # hp.quniform samples floats; n_estimators is cast to an int before use
        params['n_estimators'] = int(params.get('n_estimators'))

        for name, value in params.items():
            log_param(name, value)

        set_tag('model', 'lgb')
        print("=============================Training started...=============================")
        model = lgb.train(params, train_set=train_data, valid_sets=[valid_data])

        print("=============================Training completed...=============================")
        gc.collect()

        # =========================== Predict on validation dataset =======================
        valid_preds = model.predict(valid_x)
        print("=============================Prediction completed...=============================")

        total_valid_aucroc = roc_auc_score(valid_target_3_day, valid_preds)
        total_valid_aucroc_25_fpr = roc_auc_score(valid_target_3_day, valid_preds, max_fpr=0.25)
        total_valid_ap = average_precision_score(valid_target_3_day, valid_preds)
        agg_recall_to_optimize = None

        log_metric('total_valid_aucroc', total_valid_aucroc)
        log_metric('total_valid_ap', total_valid_ap)
        log_metric('total_valid_aucroc_at_.25_fpr', total_valid_aucroc_25_fpr)

        # =========================== Rank rows per facility and census date ==============
        valid_base = valid_idens.copy()
        valid_base['predictionvalue'] = valid_preds
        valid_base['hospitalized_within_pred_range'] = valid_target_3_day
        # method='first' breaks ties so ranks are the distinct integers 1..n within each
        # group. The default 'average' gives tied rows fractional ranks, and the
        # `predictionrank == k` lookups below would then match no row.
        valid_base['predictionrank'] = (
            valid_base.groupby(['censusdate', 'facilityid'])
            .predictionvalue.rank(method='first', ascending=False)
        )
        # Sorting by rank makes the cumulative sums in precision_recall_at_k run in rank order
        valid_base = valid_base.sort_values('predictionrank', ascending=True)

        performance_base = (
            valid_base.groupby(["facilityid", "censusdate"])
            .apply(precision_recall_at_k)
            .reset_index(drop=True)
        )

        # Median over census dates of the largest rank (= rows ranked that day) per facility
        facility_pats = (
            performance_base.groupby(['censusdate', 'facilityid']).predictionrank.max().reset_index()
            .groupby('facilityid').predictionrank.median().reset_index()
        )

        # (label used in the metric name, fraction of the median daily row count)
        top_k_fractions = ((10, .1), (15, .15), (20, .2))

        for facilityid in sorted(valid_idens.facilityid.unique()):
            mask = (valid_idens.facilityid == facilityid).values
            median_rows = facility_pats.loc[facility_pats.facilityid == facilityid].predictionrank
            # Clamp to at least 1: for small facilities the rounded cut-off can be 0,
            # which has no row at that rank and would divide by zero.
            k_by_percent = {
                percent: max(1, round(median_rows * fraction).values[0])
                for percent, fraction in top_k_fractions
            }

            rank_subset = performance_base.loc[(performance_base.facilityid == facilityid)]
            try:
                log_metric(f'facility_{facilityid}_valid_aucroc', roc_auc_score(valid_target_3_day[mask], valid_preds[mask]))
                log_metric(f'facility_{facilityid}_valid_aucroc_at_.25_fpr', roc_auc_score(valid_target_3_day[mask], valid_preds[mask], max_fpr=0.25))
                log_metric(f'facility_{facilityid}_valid_ap', average_precision_score(valid_target_3_day[mask], valid_preds[mask]))

                # Compute every cut-off before logging anything, so a failure leaves
                # no partially logged set of ranking metrics for this facility.
                ranked = {
                    percent: _agg_recall_precision_at_k(rank_subset, k)
                    for percent, k in k_by_percent.items()
                }
                for percent, (agg_recall, agg_precision) in ranked.items():
                    log_metric(f'facility_{facilityid}_agg_recall_at_{percent}_percent', agg_recall)
                    log_metric(f'facility_{facilityid}_agg_b-score_at_{percent}_percent',
                               f_beta_score(agg_precision, agg_recall))

                if params.get('facility_to_optimize_for') == facilityid:
                    agg_recall_to_optimize = ranked[15][0]

            except Exception as e:
                # workaround for infinity-benchmark because you cannot calculate facility level
                # metric for one facility.  This workaround will just skip calculating that
                # facility level metric - it will print the exception, but continue
                print(e)
                continue

        base_model = BaseModel(model_name=run_uuid, model_type='lgb', model=model)
        base_models.append(base_model)

        # ================= Save model related artifacts =========================
        with open(f'./{run_uuid}.pickle', 'wb') as f:
            pickle.dump(base_model, f)
        log_artifact(f'./{run_uuid}.pickle')

        input_features = pd.DataFrame(train_x.columns, columns=['feature'])
        input_features.to_csv(f'./input_features.csv', index=False)
        log_artifact(f'./input_features.csv')

        with open('./na_filler.pickle', 'wb') as f:
            pickle.dump(na_filler, f, protocol=4)
        log_artifact('./na_filler.pickle')

        # =============== Save the code used for training as run artifacts ===============
        for notebook in Path('/src/notebooks').glob('0*.ipynb'):
            log_artifact(str(notebook))

        for shared_code in Path('/src/shared').glob('*.py'):
            log_artifact(str(shared_code))

        for client_code in Path('/src/clients').glob('*.py'):
            log_artifact(str(client_code))

        log_artifact('./parameterTunningConfig.py')

        # hyperopt minimises the returned value, so the score is turned into a loss
        if agg_recall_to_optimize is not None:
            return 1 - agg_recall_to_optimize
        else:
            return 1 - total_valid_aucroc

# Run the TPE search: objective() is evaluated max_evals times and `best`
# receives hyperopt's description of the best sampled point.
start_time = timeit.default_timer()
best = fmin(
    fn=objective,
    space=lgb_param_space,
    algo=tpe.suggest,
    max_evals=9,
)
elapsed_seconds = timeit.default_timer() - start_time
print(f"==============Time taken for training {elapsed_seconds}======================")

In [None]:
# Show the BaseModel objects collected from the hyperopt trials
base_models