In [None]:
import copy
import os
import sys
import time

import joblib
import numpy as np
import optuna
import pandas as pd
from sklearn.metrics import roc_auc_score

sys.path.insert(0, os.path.abspath('../..'))
sys.path.insert(0, os.path.abspath('..'))

from src.train import _init_model, start_training, get_args
from src.utils.load_data_utils import get_data, get_train_eval_data
import src.rfe as rfe
from src.eval import evaluate_model
from src.tune import get_tune_args


# 1. Load best model params

In [None]:
############ Specify this: ########################
# Name of the Optuna study to analyse. The same string is used as the folder
# name under `optuna_studies/`, as the study name in the SQLite storage, and
# as the sub-folder for the evaluation results written by later cells.
study_name = "Nov-14-21:23:35_xgb_prauc_3_clinical_20000"
####################################################

def get_best_trial(study_name):
    """Load the pickled argument object stored for an Optuna study.

    Looks inside ``optuna_studies/<study_name>`` (relative to the current
    working directory) for a file whose name ends in ``rank2.pkl`` and loads
    it with joblib.

    Args:
        study_name: Name of the study folder under ``optuna_studies``.

    Returns:
        The object loaded from the first matching file (in sorted file-name
        order), or the empty string ``''`` when no file matches.

    Raises:
        FileNotFoundError: If the study folder does not exist (raised by
            ``os.listdir``).
    """
    study_folder = os.path.join("optuna_studies", study_name)
    # os.listdir returns entries in arbitrary order, so sort to make the
    # selection reproducible when more than one file matches.
    # NOTE(review): the function name says "best" while the suffix is "rank2" —
    # confirm this is the intended file.
    for file_name in sorted(os.listdir(study_folder)):
        if file_name.endswith("rank2.pkl"):
            # Return on the first hit instead of unpickling every match.
            return joblib.load(os.path.join(study_folder, file_name))
    # Kept for backward compatibility: callers receive '' when nothing matches.
    return ''

# Load the stored arguments for the selected study and display them.
# get_best_trial returns '' when no matching file exists, so an empty output
# here means nothing was found.
best_args = get_best_trial(study_name)
best_args

In [None]:
# Load the study from the local SQLite storage. Keyword arguments are used
# because newer Optuna releases declare `study_name` and `storage` as
# keyword-only parameters.
study = optuna.study.load_study(study_name=study_name, storage="sqlite:///optuna.db")
# One row per trial, highest objective value first
# (assumes the objective was maximized — TODO confirm).
study_sorted = study.trials_dataframe().sort_values('value', ascending=False)

In [None]:
# Show every trial of the study, sorted by objective value in descending order.
study_sorted

In [None]:
# Show the trial that Optuna reports as the best one for this study.
study.best_trial

# 2. Retrain model with best hyperparams for further evaluation

In [None]:
# Configure the stored best-trial arguments for retraining on the "dev/test" split.
# NOTE: these assignments modify `best_args` in place, so later cells see them too.
# presumably `nf` is the number of CV folds and 0 disables k-fold — TODO confirm
best_args.nf = 0
best_args.split = "dev/test" 

# save retrained model
# best_args.save = True

# load data (the fifth return value is not needed here and is discarded)
x_train, y_train, x_eval, y_eval, _, feature_names, class_weights = get_data(best_args) 

# retrain with best hyperparams; only the second return value (the trained models) is kept
# NOTE(review): the meaning of the positional `None` and 'auc' arguments is defined in src.train — confirm there
_, models = start_training(best_args, None, 'auc', x_train, y_train, x_eval, y_eval, feature_names, class_weights)
models

# 3. Evaluate retrained model

### (Optional) Load a specific model from the `models` directory
Set `model_name` in the next cell to evaluate a saved model. If it is left empty, the `models` returned by the retraining cell above are used.

In [None]:
############ Specify this (optional): ########################
# File name of a saved model inside ../../models. Leave empty to keep the
# `models` produced by the retraining cell.
model_name = ""
####################################################

if model_name != "":
    model_folder = os.path.abspath('../../models')
    model_path = os.path.join(model_folder, model_name)
    # A serialized model is a pickled Python object, which np.load refuses to
    # read unless allow_pickle=True (same flag as used for RFE_features.pkl).
    # NOTE(review): unpickling can execute arbitrary code — only load files you
    # created yourself. If the models are saved with joblib, joblib.load may be
    # the matching loader — confirm against the saving code.
    models = [np.load(model_path, allow_pickle=True)]
    print("Loading `model` from directory")
else:
    # Nothing to load: the `models` from the retraining cell stay in use.
    print("")

## Run evaluation (pre RFE)
Works for both single and multiple models / folds

In [None]:
# Use the test split. Work on a shallow copy so that the overrides below do not
# silently modify `best_args`, which later cells reuse.
test_args = copy.copy(best_args)
test_args.split = "dev/test" # will be used only if nf set to 0
test_args.nf = 0
# Only the evaluation features/labels and the feature names are needed here.
_, _, x_test, y_test, _, feature_names, _ = get_data(test_args)

# define output directory for eval results
test_output_dir = os.path.join("../final_eval_results", study_name, "pre-rfe")
os.makedirs(test_output_dir, exist_ok=True)

# run evaluation (computes cumulative metrics if more than one model is evaluated here)
evaluate_model(models, x_test, y_test, test_output_dir, feature_names, test_args)

#print(f"Find evaluation results in\n {test_output_dir}.")
#study = optuna.study.load_study(study_name, storage="sqlite:///optuna.db")
#print(f"\noptuna value : {study.best_trial.value:.3f}")

## Feature importances

In [None]:
############ Specify this: ########################
# Study whose pre-RFE feature importances are inspected. It may differ from
# `study_name`; the cells below expect blood features to be present.
importance_study_name = "Nov-14-21:25:01_xgb_prauc_3_clinical,blood_20000"
####################################################

# Build the path relative to the notebook, the same way the evaluation cells
# build their output directory, instead of relying on one user's home folder.
# NOTE(review): assumes ../final_eval_results is the directory the original
# absolute path pointed to — confirm.
importances_path = os.path.join(
    "../final_eval_results",
    importance_study_name,
    "pre-rfe",
    "XGBClassifier_feature_importances.csv",
)
feature_importances = pd.read_csv(importances_path)
# Order the columns by their value in row 0, largest first.
feature_importances = feature_importances.sort_values(0, axis=1, ascending=False)
# Turn zeros into NaN so that dropna removes every column containing a zero.
feature_importances[feature_importances == 0] = np.nan
feature_importances = feature_importances.dropna(axis=1)
feature_importances

In [None]:
# Label-based column slice: all columns from the first one up to and including
# 'blood_T1_SORL1' (.loc slices include the end label).
feature_importances.loc[:, :'blood_T1_SORL1']

In [None]:
# Column names that remain after the zero-valued columns were dropped.
feature_importances.columns

In [None]:
# Sum of row 0 across all remaining columns.
feature_importances.loc[0, :].sum()

# 4. Run RFE on re-trained model

In [None]:
# RFE-specific settings; they are merged with the stored best-trial arguments below.
rfe_args = {
    'use_best_args': 1,
    'study': study_name,
    'rfe_ratio': 0.1,
    'rfe_ksplits': 3,
    'rfe_njobs': 1,
    'rfe_scoring': 'average_precision',
}
# On duplicate keys the values from `best_args` win because they are unpacked last.
merged_args = {**rfe_args, **vars(best_args)}
tune_args = get_tune_args(merged_args)
rfe_combined_args = get_args({**tune_args})
rfe_combined_args.split = 'dev/test' # performs k-fold on dev set

# Start the RFE run for this study with the combined arguments.
rfe.main(rfe_combined_args, study=study_name)

# 5. Post-RFE model retraining

In [None]:
# Read the RFE feature selection stored in the study folder
# (presumably written by the RFE step above — TODO confirm).
# NOTE: allow_pickle=True unpickles the file; only load files you produced yourself.
rfe_features_path = os.path.join("optuna_studies", study_name, "RFE_features.pkl")
rfe_features = np.load(rfe_features_path, allow_pickle=True)

In [None]:
def get_rfe(splits, rfe_features, feature_names=None):
    """Restrict every split to the columns of the RFE-selected features.

    Args:
        splits: Iterable of 2-D array-likes; columns follow the order of
            ``feature_names``.
        rfe_features: Iterable of feature names to keep. The output columns
            follow this order.
        feature_names: Names of all columns in the splits. When omitted, the
            notebook-level variable ``feature_names`` is used, which keeps
            existing two-argument calls working.

    Returns:
        A new list with one ``numpy.ndarray`` per split, containing only the
        selected columns. The ``splits`` argument itself is left untouched.

    Raises:
        ValueError: If a name in ``rfe_features`` is not in ``feature_names``.
    """
    if feature_names is None:
        # Backward-compatible fallback to the notebook-level variable.
        feature_names = globals()["feature_names"]

    # Map each name to its first column position (same result as list.index).
    column_of = {}
    for position, name in enumerate(feature_names):
        column_of.setdefault(name, position)

    unknown = [feature for feature in rfe_features if feature not in column_of]
    if unknown:
        raise ValueError(f"{unknown[0]!r} is not in feature_names")

    # The column positions do not depend on the split, so compute them once.
    rfe_indcs = [column_of[feature] for feature in rfe_features]
    return [np.asarray(split)[:, rfe_indcs] for split in splits]

In [None]:
# Reload the data with the stored best-trial arguments, then keep only the RFE-selected columns.
# NOTE(review): `n_features` is whatever get_data returned (before column selection) and is not used below.
x_train, y_train, x_eval, y_eval, n_features, feature_names, class_weights = get_data(best_args) 
# get_rfe looks up column positions in the notebook-level `feature_names` assigned on the line above.
x_train, x_eval = get_rfe(x_train, rfe_features), get_rfe(x_eval, rfe_features)
# retrain with best hyperparams; `rfe_features` is passed where the pre-RFE cell passed `feature_names`
_, models = start_training(best_args, None, 'auc', x_train, y_train, x_eval, y_eval, rfe_features, class_weights)
# Display the retrained model(s); this overwrites the pre-RFE `models`.
models

## Evaluate

In [None]:
# Use the test split. Work on a shallow copy so that the overrides below do not
# silently modify `best_args`.
test_args = copy.copy(best_args)
test_args.split = "dev/test"
test_args.nf = 0
_, _, x_test, y_test, _, feature_names, _ = get_data(test_args)
# Keep only the RFE-selected columns so the test data matches the retrained models.
x_test = get_rfe(x_test, rfe_features)

# define output directory for eval results
test_output_dir = os.path.join("../final_eval_results", study_name, "post-rfe")
os.makedirs(test_output_dir, exist_ok=True)

# run evaluation (computes cumulative metrics if more than one model is evaluated here)
evaluate_model(models, x_test, y_test, test_output_dir, rfe_features, test_args)

print(f"Find evaluation results in\n {test_output_dir}.")

# 6. After post-RFE retuning (done outside this notebook)