## How to run this?

1. Run the training notebooks `01` through `04`, making the changes that each notebook asks for.
    1.1 If you want to generate the first 2 batch data-cards, also follow the notebooks dedicated to that.
2. At the end of notebook `04`, the `final_df.parquet` file is stored in the `/data/processed/` folder.
3. This notebook reads that file and mimics the prediction pipeline for the entire batch.
4. Set the model ID (`modelid`) in the **Define your constants here** section; the **Download Model from MLflow** section uses it to fetch the model.
5. Change the experiment ID in the **Experiment exploration** section if you want to check some stats. It is not used for predictions.

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import mlflow
import os
from pathlib import Path
from datetime import timedelta, datetime
import re
import pickle
from omegaconf import OmegaConf
from eliot import start_action, start_task, log_message, to_file


import sys

from saiva.training import load_lgb_model, download_model_from_mlflow, load_x_y_idens
from saiva.training.data_models import BaseModel
from saiva.training.metrics import run_test_set
from saiva.model.shared.utils import get_client_class, url_encode_cols
from saiva.model.explanations.config import FEATURE_TYPE_MAPPING

from batch_preds_utils import prep, download_model_cat_cols_list_mlflow, preprocess_final_df

# Register stdout as the destination for eliot log messages (`to_file` is imported from eliot above).
to_file(sys.stdout)

## ============== Define your constants here ===================

In [None]:
# Model ID: passed to the MLflow download helpers, to load_lgb_model and to run_test_set in later cells.
modelid = '8a1c3903cf1e4e09ba3c491a7b999603'
# Client name forwarded to prep() as `client`.
CLIENT = "avante"
# Model type: used as-is for BaseModel's model_name; its lower-cased form is used to build
# column names (target / positive-date columns) and the pickle file names under /data/test/.
MODEL_TYPE = "MODEL_UPT"
# Start and end dates handed to preprocess_final_df
# (presumably the census-date window to run predictions on; confirm in batch_preds_utils).
TEST_START_DATE = "2024-01-01"
TEST_END_DATE = "2024-01-31"
# Keep None to run predictions on all facilities; otherwise provide a list of
# integer facility ids, e.g. [1, 3, 5, 7].
FACILITY_IDS = None

# Folder and file name of the feature-engineered dataframe; joined into a single path before loading.
processed_path = Path('/data/processed/')
filename = "final_df.parquet"

## ============== Download Model from MLflow ===================

In [None]:
# Fetch the model identified by `modelid` from MLflow.
download_model_from_mlflow(modelid)
# Fetch the model's categorical-columns list into /data/model/
# (presumably written as cate_columns.pickle, which a later cell reads; confirm in batch_preds_utils).
download_model_cat_cols_list_mlflow(modelid, "/data/model/")

## =============== Load Model from local folder ===================

In [None]:
# Load the LightGBM model for `modelid` and wrap it in the project's BaseModel container.
lgb_model = load_lgb_model(modelid)
model = BaseModel(model=lgb_model, model_type="lgb", model_name=MODEL_TYPE)

# NOTE(review): presumably normalises feature names carrying a "v6" suffix;
# see saiva.training.data_models.BaseModel for the exact behaviour.
model.truncate_v6_suffix()

In [None]:
# Load the categorical column list from its pickle file.
# NOTE: pickle.load can execute arbitrary code; only load artifacts from a trusted source.
with open('/data/model/cate_columns.pickle', 'rb') as f:
    cate_cols = pickle.load(f)

## ============== Run inference on feature engineered data ===================

In [None]:
# Identifier columns handed to prep() as `iden_cols`; the last entry is the
# model-specific positive-date column derived from MODEL_TYPE.
IDEN_COLS = [
    'censusdate',
    'facilityid',
    'masterpatientid',
    'LFS',
    'primaryphysicianid',
    'payername',
    'to_from_type',
    'client',
    'admissionstatus',
    f'positive_date_{MODEL_TYPE.lower()}',
]

In [None]:
# Full path of the feature-engineered parquet file.
final_df_path = os.path.join(processed_path, filename)

# Build the test dataframe from the processed file for the configured model type,
# date range and (optional) facility list.
test = preprocess_final_df(
    final_df_path,
    MODEL_TYPE.lower(),
    TEST_START_DATE,
    TEST_END_DATE,
    FACILITY_IDS,
)

In [None]:
%%time
# Name of the 3-day target column for the configured model type.
target_column = f'target_3_day_{MODEL_TYPE.lower()}'

# Split the test dataframe into model features, the 3-day target and the identifier columns.
test_x, test_target_3_day, test_idens = prep(
    test,
    model,
    client=CLIENT,
    iden_cols=IDEN_COLS,
    pandas_categorical=model.model.pandas_categorical,
    categorical_columns=cate_cols,
    target_col=target_column,
)

# Last expression of the cell: display the shapes of the three outputs.
(test_x.shape, test_target_3_day.shape, test_idens.shape)

In [None]:
# Store the created test_x, test target and idens in the /data/test folder so they can be
# reloaded later, or so that another client's featurization can be run if desired.
# exist_ok=True makes this a no-op when the folder already exists and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('/data/test', exist_ok=True)

# One pickle per artifact; file names follow final-test_<artifact>_<model type>.pickle.
for artifact_name, artifact in (
    ('x', test_x),
    ('target_3_day', test_target_3_day),
    ('idens', test_idens),
):
    with open(f'/data/test/final-test_{artifact_name}_{MODEL_TYPE.lower()}.pickle', 'wb') as f:
        pickle.dump(artifact, f, protocol=4)

In [None]:
# Reload the features, target and identifier frames from /data/test/ for the 'test' split.
x, y, idens = load_x_y_idens('/data/test/', MODEL_TYPE.lower(), 'test')

# The evaluated date range is taken from the data itself, not from the configured constants.
census_dates = test['censusdate']

# NOTE(review): `modelid` is passed for both the second and third positional
# arguments; confirm this against run_test_set's signature.
(
    test_total_aucroc,
    test_recall,
    test_recall_LE30,
    test_recall_G30,
    test_short_term_recall,
    test_long_term_recall,
) = run_test_set(
    model,
    modelid,
    modelid,
    test_start_date=census_dates.min().strftime('%Y-%m-%d'),
    test_end_date=census_dates.max().strftime('%Y-%m-%d'),
    x_df=x,
    target_3_day=y,
    idens=idens,
    model_type=MODEL_TYPE.lower(),
    dataset='TEST',
    log_in_mlflow=False,
    threshold=0.15,
)