In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import mlflow
import shap

import sys

from saiva.training import load_lgb_model, download_model_from_mlflow, load_x_y_idens
from saiva.training.data_models import BaseModel
from saiva.training.metrics import run_test_set

## =================== Experiment exploration ===================

In [None]:
# Metric column used to rank the runs, and the MLflow experiment to search.
monitor = 'metrics.TEST_02_upt_recall_at_rank_15'
experiment_id = 466

# One row per run of the experiment.
df = mlflow.search_runs(experiment_ids=[experiment_id])

# Pick the run with the highest value of `monitor`.
# Only the two expected "nothing to rank" failures fall back to an empty
# config: the metric column is absent (KeyError) or no row was returned
# (IndexError). Any other error is a real problem and is allowed to surface
# instead of being silently swallowed.
try:
    best_run_config = df.nlargest(1, monitor).iloc[0].to_dict()
except (KeyError, IndexError) as err:
    print(f'Could not select a best run for {monitor!r} in experiment {experiment_id}: {err!r}')
    best_run_config = dict()

# Each value is None when no best run could be selected or the run lacks the field.
BEST_AUC = best_run_config.get('metrics.TEST_01_aucroc', None)
LEARNING_RATE = best_run_config.get('params.hp__learning_rate', None)
NUM_ITERATIONS = best_run_config.get('params.p__best_iteration', None)
MODELID = best_run_config.get('run_id', None)

print(f'Best model has AUC = {BEST_AUC}')
print(f'With total estimators = {NUM_ITERATIONS}')
print(f'And learning rate = {LEARNING_RATE}')
print(f'Model ID = {MODELID}')

## ============== Download Model from MLflow ===================

In [None]:
# Identifier of the model to fetch (hard-coded; presumably an MLflow run id —
# TODO confirm against download_model_from_mlflow).
modelid = '4e56c8c354554a0bacd52765b9521897'
# Download the model for that identifier.
# NOTE(review): where the files are written is decided inside the helper.
download_model_from_mlflow(modelid)

## =============== Load Model from local folder ===================

In [None]:
# Same identifier as in the download cell, repeated here (presumably so this
# cell can be run without re-running the download).
modelid = '4e56c8c354554a0bacd52765b9521897'
# `model` is used by every later cell (feature importance, test run, SHAP).
model = load_lgb_model(modelid)

## ============= List Feature Importance of the model ==============

In [None]:
# Table of every model feature with its importance, most important first.
feature_names = model.feature_name()
gain_importance = model.feature_importance(importance_type='gain')  # alternative: 'split'

feature_imp = pd.DataFrame(
    {'feature': feature_names, 'importance': gain_importance}
).sort_values(by='importance', ascending=False)

# Show the ten highest-ranked features.
feature_imp.head(10)

In [None]:
# Plot the importance of at most 50 features on a 15x15 figure.
importance_plot_options = {'max_num_features': 50, 'figsize': (15, 15)}
lgb.plot_importance(model, **importance_plot_options)

## ============== Run test on pre-loaded model ===================

In [None]:
# Load the 'test' split of 'model_upt'; the three results are passed on below
# as the feature frame, the target and the identifier columns.
x, y, idens = load_x_y_idens('/data/processed/', 'model_upt', 'test')

# The evaluation window is the span of census dates present in the identifiers.
DATE_FORMAT = '%Y-%m-%d'
census_dates = idens['censusdate']

run_test_set(
    model,
    modelid,
    modelid,
    test_start_date=census_dates.min().strftime(DATE_FORMAT),
    test_end_date=census_dates.max().strftime(DATE_FORMAT),
    x_df=x,
    target_3_day=y,
    idens=idens,
    model_type='upt',
    threshold=0.15,
    log_in_mlflow=False,  # evaluate only; do not log to MLflow
)

## ============== Run Shap Explanations for Test Set ==============

In [None]:
# SHAP takes a long time to run across the whole test dataset, so explain only
# a random sample of n rows for faster results.
n = 50
SHAP_SEED = 42  # fixed seed so a fresh kernel explains the same rows again

explainer = shap.TreeExplainer(model)

# Sample row *positions* (not index labels) so that the very same positions can
# be used to look up the identifier rows, whatever index `x` carries.
# The sample size is capped at len(x) so a small test set does not raise.
# NOTE(review): assumes x and idens are row-aligned — confirm against load_x_y_idens.
rng = np.random.default_rng(SHAP_SEED)
positions = rng.choice(len(x), size=min(n, len(x)), replace=False)
subset = x.iloc[positions]
shap_values = explainer.shap_values(subset)

# For a binary objective shap may return a list with one array per class; in
# that case keep the array for class 1. If a single array is returned (any
# other objective, or a shap version that does not return a list), use it
# as-is. Resolved once here instead of on every loop iteration.
is_binary = model.params.get('objective') == 'binary'
if is_binary and isinstance(shap_values, list):
    attributions = shap_values[1]
else:
    attributions = shap_values

shap_results = []

# Build one long-format frame per sampled row: one line per feature holding its
# attribution, its value, and the identifiers of the row it belongs to.
for i, (_, row) in enumerate(subset.iterrows()):
    shaps = pd.DataFrame(
        {
            "feature": subset.columns,
            "attribution_score": attributions[i],
            "feature_value": row,
        }
    )

    # Identifier row at the same position as the sampled feature row.
    iden = idens.iloc[positions[i]]
    shaps["masterpatientid"] = iden.masterpatientid
    shaps["facilityid"] = iden.facilityid
    shaps["censusdate"] = iden.censusdate

    shap_results.append(shaps)

# Single concat at the end (cheaper than growing a frame inside the loop).
results = pd.concat(shap_results)

In [None]:
# Count how often each feature has an attribution above 0.1 among the sampled
# rows, and show the 25 most frequent ones.
strong_attributions = results[results['attribution_score'] > 0.1]
ranked_attributions = strong_attributions.sort_values(by=['attribution_score'], ascending=False)
ranked_attributions['feature'].value_counts().head(25)