In [None]:
import numpy as np
import pandas as pd
import mlflow
from pathlib import Path
import sys
import pickle

from saiva.model.shared.constants import MODEL_TYPE
# Rebind the imported constant to its lower-cased form; this lower-cased value
# is what the notebook later passes to load_x_y_idens and the training helpers
# and uses to build the `positive_date_{MODEL_TYPE}` column name.
MODEL_TYPE = MODEL_TYPE.lower()
from saiva.model.shared.utils import get_client_class
from saiva.training import (
    train_optuna_integration, 
    train_optuna_pure_lgbm_model, 
    get_facilities_from_train_data, 
    IdensDataset, 
    load_x_y_idens, 
    load_config,
)

## Load config

In [None]:
# NOTE(review): load_config is also imported from `saiva.training` in the first
# cell; the import below rebinds the name to `saiva.training.utils.load_config`.
# Presumably both refer to the same function — confirm and keep a single import.
from saiva.model.shared.constants import LOCAL_TRAINING_CONFIG_PATH
from saiva.training.utils import load_config

# Load the configuration from the local training-config path. `config` is the
# full object (passed whole to the training calls); `training_config` is its
# training-specific section, read throughout the rest of the notebook.
config = load_config(LOCAL_TRAINING_CONFIG_PATH)
training_config = config.training_config

### ========= Set the CONFIG & HYPER_PARAMETER_TUNING in constants.py ==========

In [None]:
# Run-level settings taken from the loaded training configuration.
EXPERIMENT_DATES = training_config.training_metadata.experiment_dates
vector_model = training_config.training_metadata.vector_model

# Client label: the organization id of every configured organization, joined with "+".
CLIENT = "+".join(
    org_config.organization_id for org_config in training_config.organization_configs
)

TRAINING_DATA = CLIENT  # which data the model is trained on, e.g. avante + champion
SELECTED_MODEL_VERSION = 'saiva-3-day-upt_v6'  # e.g. v3, v4 or v6 model

# Used to filter models in AWS QuickSight and as the MLflow experiment name.
MODEL_DESCRIPTION = f'{CLIENT}-3-day-upt-v6'  # e.g. 'avante-upt-v6-model'

# Echo the effective settings so the run can be sanity-checked before training.
print('MODEL_TYPE:', MODEL_TYPE)
print('HYPER_PARAMETER_TUNING:', training_config.training_metadata.hyper_parameter_tuning)
print('CLIENT:', CLIENT)
EXPERIMENT_DATES

## ============ Initialise MLFlow Experiment =============

In [None]:
# Point the MLflow client at the tracking server used by this notebook.
mlflow.set_tracking_uri('http://mlflow.saiva-dev')

# Make MODEL_DESCRIPTION the active experiment; this is the name that appears in MLflow.
mlflow.set_experiment(MODEL_DESCRIPTION)

# Look the experiment up by name to obtain its id, which the training calls
# further down receive as MLFLOW_EXPERIMENT_ID.
EXPERIMENT = mlflow.get_experiment_by_name(MODEL_DESCRIPTION)
MLFLOW_EXPERIMENT_ID = EXPERIMENT.experiment_id

print(f'Experiment ID: {MLFLOW_EXPERIMENT_ID}')

## =================== Loading data ======================

In [None]:
# Directory from which the following cells read the train/valid/test splits
# and the pickled feature metadata.
processed_path = Path('/data/processed')
# Create the directory (and any missing parents); no error if it already exists.
processed_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Load each split from the processed-data directory. Every call yields three
# objects: the feature matrix, the target, and the identifier frame.
train_x, train_target_3_day, train_idens = load_x_y_idens(
    processed_path, MODEL_TYPE, 'train'
)
valid_x, valid_target_3_day, valid_idens = load_x_y_idens(
    processed_path, MODEL_TYPE, 'valid'
)
test_x, test_target_3_day, test_idens = load_x_y_idens(
    processed_path, MODEL_TYPE, 'test'
)

In [None]:
# Print the shape of each feature matrix followed by its target, in
# train / valid / test order (one shape per line).
for split_array in (
    train_x, train_target_3_day,
    valid_x, valid_target_3_day,
    test_x, test_target_3_day,
):
    print(split_array.shape)

In [None]:
# Read the pickled feature metadata stored next to the processed data.
# NOTE: pickle.load can execute arbitrary code, so these files must come from
# a trusted preprocessing run.
with open(processed_path / 'cate_columns.pickle', 'rb') as f:
    cate_columns = pickle.load(f)

with open(processed_path / 'feature_names.pickle', 'rb') as f:
    feature_names = pickle.load(f)

with open(processed_path / 'pandas_categorical.pickle', 'rb') as f:
    pandas_categorical = pickle.load(f)

In [None]:
# Identifier columns carried alongside each dataset.
info_cols = [
    'facilityid',
    'censusdate',
    'masterpatientid',
    f'positive_date_{MODEL_TYPE}',
    'LFS',
    'long_short_term',
]

# Keyword arguments that are the same for all three splits.
common_dataset_args = {
    'feature_name': feature_names,
    'categorical_feature': cate_columns,
}

train_data = IdensDataset(
    train_x,
    label=train_target_3_day,
    idens=train_idens.loc[:, info_cols],
    **common_dataset_args,
    free_raw_data=False,
)
valid_data = IdensDataset(
    valid_x,
    label=valid_target_3_day,
    idens=valid_idens.loc[:, info_cols],
    **common_dataset_args,
    free_raw_data=False,
)
# NOTE(review): unlike train/valid, the test dataset does not pass
# free_raw_data=False — confirm whether that difference is intentional.
test_data = IdensDataset(
    test_x,
    label=test_target_3_day,
    idens=test_idens.loc[:, info_cols],
    **common_dataset_args,
)

## =================== Model Training: *optuna_integration*===================

This section uses the newer Optuna-integration training method. After starting it, watch the run for about 5 minutes to make sure everything is working properly; if no issues appear, you can leave it running and work on something else. Training typically takes around 12–24 hours (depending on the size of the dataset), and you can track the results in MLflow.

In [None]:
# Fixed parameters handed to the training helper as its first argument.
params = dict(
    seed=1,
    metric="auc",
    verbosity=5,
    boosting_type="gbdt",
)

# Short alias for the metadata section that several arguments are read from.
training_metadata = training_config.training_metadata

# All arguments are positional, so their order must match the helper's signature.
# NOTE(review): the optuna_pure_lgbm cell further down also assigns `model`,
# so running both cells keeps only the later result.
model = train_optuna_integration(
    params,
    train_data,
    valid_data,
    test_data,
    training_metadata.vector_model,
    MODEL_TYPE,
    EXPERIMENT_DATES,
    training_metadata.hyper_parameter_tuning,
    TRAINING_DATA,
    SELECTED_MODEL_VERSION,
    MODEL_DESCRIPTION,
    MLFLOW_EXPERIMENT_ID,
    training_metadata.optuna_time_budget,
    pandas_categorical,
    config,
)

## =================== Model Training: *optuna_pure_lgbm*===================

In [None]:
# Fixed parameters handed to the training helper as its first argument.
params = dict(
    seed=1,
    metric="auc",
    verbosity=5,
    boosting_type="gbdt",
)

# Short alias for the metadata section that several arguments are read from.
training_metadata = training_config.training_metadata

# Positional arguments keep the original order; only `n_trials` is passed by keyword.
# NOTE(review): this rebinds `model`, replacing whatever the optuna_integration
# cell above assigned if that cell was also run.
model = train_optuna_pure_lgbm_model(
    params,
    train_data,
    valid_data,
    test_data,
    training_metadata.vector_model,
    MODEL_TYPE,
    EXPERIMENT_DATES,
    training_metadata.hyper_parameter_tuning,
    TRAINING_DATA,
    SELECTED_MODEL_VERSION,
    MODEL_DESCRIPTION,
    MLFLOW_EXPERIMENT_ID,
    training_metadata.optuna_time_budget,
    pandas_categorical,
    config,
    n_trials=1,
)