# Measure runtime for different interestingness measures with and without optimistic estimates

## Default Values for Papermill Parameters

In [None]:
# Number of timed repetitions of the subgroup discovery run.
PARAM_NUM_RUNS = 10
# File name under which the interestingness measure object is pickled.
PARAM_QF_OUTPUT_FILENAME = "interestingness_measure.pickle"
# Output file names for the result sets and the runtime measurements.
# None means a name is derived from PARAM_QF and the optimistic-estimate flag.
PARAM_RESULT_SETS_OUT_FILENAME = None
PARAM_MEASUREMENTS_OUT_FILENAME = None
# Input locations of the datasets and the serialized models; both are passed
# through util.prepend_experiment_output_path before they are used.
PARAM_DATA_IN_PATH = "../../data"
PARAM_MODELS_IN_PATH = "../../models"

# subgroup discovery parameters
PARAM_DATASET_NAME = "OpenML Adult"
PARAM_MODEL_NAME = "sklearn_gaussian_nb_adult"

# Key of the interestingness measure to run.
PARAM_QF = "average_ranking_loss" # or "sklearn.metrics.roc_auc_score" or "prc_auc_score"
# Boolean flags are given as strings and parsed with util.str_to_bool.
PARAM_ENABLE_OPTIMISTIC_ESTIMATE = "False"
PARAM_MINIMUM_SUPPORT_CONSTRAINT = None  # default: 1/100 of dataset size
# Assigned to the measure's subgroup_size_weight and
# subgroup_class_balance_weight attributes before the runs.
PARAM_COVER_SIZE_WEIGHT = 0
PARAM_CLASS_BALANCE_WEIGHT = 0
# Search depth passed to the subgroup discovery run.
PARAM_DEPTH = 3

# If true, equality selectors with a falsy attribute value are removed from
# the search space.
PARAM_REMOVE_DUMMY_FALSE_SELECTORS = "True"

## Prepare constant variables that will be used throughout the notebook.

In [None]:
from subroc import util
from subroc.quality_functions.base_qf import OptimizationMode
from subroc.model_serialization import deserialize
from subroc.datasets.metadata import to_DatasetName
from subroc.datasets.reader import DatasetReader, DatasetStage
from subroc.quality_functions.sklearn_metrics import get_soft_classification_metric_qf_dict
from subroc.quality_functions.custom_metrics import get_custom_metric_qf_dict
from subroc.quality_functions.soft_classifier_target import SoftClassifierTarget
from subroc.quality_functions.base_qf import PredictionType
from subroc.sd_process import run_subgroup_discovery
from subroc.selectors import create_selectors

import pickle
import timeit
from tqdm import tqdm
import pandas as pd
import pysubgroup as ps
import math
import os

# fill environment variables into params
PARAM_DATA_IN_PATH = util.prepend_experiment_output_path(PARAM_DATA_IN_PATH)
PARAM_MODELS_IN_PATH = util.prepend_experiment_output_path(PARAM_MODELS_IN_PATH)

# get environment variables
# NOTE(review): both names read the same environment variable and differ only
# in their fallback value — confirm that two separate roots are intended.
STAGE_OUTPUT_PATH = os.environ.get("STAGE_OUTPUT_PATH", "../..")
STAGE_OUTPUT_PATH2 = os.environ.get("STAGE_OUTPUT_PATH", "../outputs")

# Output file names that were not given explicitly are derived from the
# interestingness measure key and the optimistic-estimate flag.
needs_default_name = (
    PARAM_RESULT_SETS_OUT_FILENAME is None
    or PARAM_MEASUREMENTS_OUT_FILENAME is None
)
if needs_default_name:
    oe_suffix = "with_oe" if util.str_to_bool(PARAM_ENABLE_OPTIMISTIC_ESTIMATE) else "no_oe"
    default_stem = f"{PARAM_QF}_{oe_suffix}.csv"

    if PARAM_RESULT_SETS_OUT_FILENAME is None:
        PARAM_RESULT_SETS_OUT_FILENAME = f"result_sets_{default_stem}"

    if PARAM_MEASUREMENTS_OUT_FILENAME is None:
        PARAM_MEASUREMENTS_OUT_FILENAME = f"runtimes_{default_stem}"

##### Subgroup Discovery Parameters #####

# Dataset
dataset_reader = DatasetReader(PARAM_DATA_IN_PATH)

# Directory that receives the data frames with model predictions.
# exist_ok=True already tolerates an existing directory, so no prior
# existence check is needed.
DATA_OUT_PATH = f"{STAGE_OUTPUT_PATH}/data/processed"
os.makedirs(DATA_OUT_PATH, exist_ok=True)

DATASET_NAME = to_DatasetName(PARAM_DATASET_NAME)

# Stop here instead of continuing with DATASET_NAME = None: the model would
# still be loaded and the run would only fail later, far from the cause.
if DATASET_NAME is None:
    raise ValueError(f"dataset name '{PARAM_DATASET_NAME}' not supported.")

DATASET_STAGE = DatasetStage.PROCESSED_MODEL_READY

# Model
model = deserialize(PARAM_MODELS_IN_PATH, PARAM_MODEL_NAME)

# Selectors: Defined in a later cell

# Constraints
DEPTH = PARAM_DEPTH  # configures the default constraints
CONSTRAINTS = []  # filled in the "Define Constraints" cell

# Optimistic Estimate
ENABLE_OPTIMISTIC_ESTIMATES = util.str_to_bool(PARAM_ENABLE_OPTIMISTIC_ESTIMATE)

# Interestingness Measure(s)

# The optimization mode is defined first and handed to every factory call, so
# that changing this one constant really changes the measures built below.
OPTIMIZATION_MODE = OptimizationMode.Maximal  # effectively modifies the interestingness measure

# All supported measures, keyed by the names accepted for PARAM_QF.
qfs_dict = {
    "average_ranking_loss": get_custom_metric_qf_dict(
        OPTIMIZATION_MODE,
        enable_optimistic_estimates=ENABLE_OPTIMISTIC_ESTIMATES,
    )["average_ranking_loss"],
    "sklearn.metrics.roc_auc_score": get_soft_classification_metric_qf_dict(
        OPTIMIZATION_MODE,
        enable_optimistic_estimates=ENABLE_OPTIMISTIC_ESTIMATES,
    )["sklearn.metrics.roc_auc_score"],
    "prc_auc_score": get_custom_metric_qf_dict(
        OPTIMIZATION_MODE,
        enable_optimistic_estimates=ENABLE_OPTIMISTIC_ESTIMATES,
    )["prc_auc_score"],
}

# Same exception type as a plain dict lookup, but the message names the
# supported keys so a mistyped parameter is easy to spot.
if PARAM_QF not in qfs_dict:
    raise KeyError(
        f"unsupported interestingness measure '{PARAM_QF}'; expected one of {sorted(qfs_dict)}"
    )
QF = qfs_dict[PARAM_QF]

# Result Set Size
# Passed as result_set_size to run_subgroup_discovery.
RESULT_SET_SIZE = 5

# Interestingness Minimum
# Passed as min_quality to run_subgroup_discovery.
MIN_QUALITY = 0

# Column names of the per-run runtime measurement records.
QF_NAME_COL = "qf_name"
OPTIMISTIC_ESTIMATE_COL = "optimistic_estimate"
DEPTH_COL = "depth"
SAMPLE_INDEX_COL = "sample_index"
TIME_COL = "time"
NUM_VISITED_SUBGROUPS_COL = "num_visited_subgroups"
OPTIMIZATION_MODE_COL = "optimization_mode"

## General data preparation

In [None]:
# Load the train/test split in the configured stage together with its metadata.
(train_data, test_data), dataset_meta = dataset_reader.read_dataset(DATASET_NAME, DATASET_STAGE)

# Mark the predictions as soft classification outputs.
dataset_meta.prediction_type = PredictionType.CLASSIFICATION_SOFT

gt_name, score_name = dataset_meta.gt_name, dataset_meta.score_name

# Predict on every column except the ground truth and store the model output
# in the score column of the same frame.
train_data_x = train_data.loc[:, train_data.columns != gt_name]
train_data[score_name] = model.predict(train_data_x)
test_data_x = test_data.loc[:, test_data.columns != gt_name]
test_data[score_name] = model.predict(test_data_x)

# Write both frames, now including the predictions, below the dataset's
# own output directory.
out_path = "/".join((DATA_OUT_PATH, dataset_meta.dataset_dir))
if not os.path.exists(out_path):
    os.makedirs(out_path, exist_ok=True)

for file_name, frame in (
    ("model_predicted_train.csv", train_data),
    ("model_predicted_test.csv", test_data),
):
    frame.to_csv("/".join((out_path, file_name)), index=False)

# Target object for subgroup discovery, built from the ground-truth and
# score column names.
target = SoftClassifierTarget(gt_name, score_name)

## Define Selectors (Search Space)

In [None]:
IGNORE_NULL = True

# Candidate selectors over all columns except ground truth and score.
intermediate_search_space = create_selectors(
    train_data,
    nbins=10,
    ignore=[dataset_meta.gt_name, dataset_meta.score_name],
    ignore_null=IGNORE_NULL
)

if util.str_to_bool(PARAM_REMOVE_DUMMY_FALSE_SELECTORS):
    # Dummy attributes contain values that correspond to inequality of an original nominal value.
    # Equality selectors with such values effectively define inequality selectors with the original nominal values.
    # They are therefore removed.
    # NOTE(review): the check is a plain truthiness test, so equality selectors
    # with value 0, "" or False on non-dummy attributes are dropped as well —
    # confirm this is intended.
    SEARCH_SPACE = [
        candidate
        for candidate in intermediate_search_space
        if not isinstance(candidate, ps.EqualitySelector) or candidate.attribute_value
    ]
else:
    SEARCH_SPACE = intermediate_search_space

## Define Constraints

In [None]:
# Minimum support: the explicit parameter if given, otherwise 1/100 of the
# number of test rows, rounded up.
# NOTE(review): the default is derived from test_data while the search itself
# runs on train_data — confirm which data set size is meant.
min_support = (
    math.ceil(len(test_data) / 100)
    if PARAM_MINIMUM_SUPPORT_CONSTRAINT is None
    else PARAM_MINIMUM_SUPPORT_CONSTRAINT
)
CONSTRAINTS.append(ps.MinSupportConstraint(min_support))

## Measure runtime

In [None]:
# One record per timed run, and one record per subgroup returned by any run.
times_dicts = []
results_per_qf_and_depth = []

qf_name = QF.__name__

oe_note = " with optimistic estimate" if QF.metric_optimistic_estimate is not None else ""
print(f"Running qf {qf_name}{oe_note} and {QF.optimization_mode}")

# Set the weights of the cover size and class balance terms.
QF.subgroup_size_weight = PARAM_COVER_SIZE_WEIGHT
QF.subgroup_class_balance_weight = PARAM_CLASS_BALANCE_WEIGHT

for run_index in tqdm(range(PARAM_NUM_RUNS)):
    # Snapshot the clock and the measure's invocation counter before the run;
    # the differences after the run are what gets recorded.
    started_at = timeit.default_timer()
    visited_before = QF.calculate_statistics_invocation_count

    results = run_subgroup_discovery(
        [QF],
        [qf_name],
        train_data,
        dataset_meta,
        result_set_size=RESULT_SET_SIZE,
        ignore_null=IGNORE_NULL,
        summarize=False,
        target=target,
        constraints=CONSTRAINTS,
        search_space=SEARCH_SPACE,
        depth=DEPTH,
        min_quality=MIN_QUALITY,
        sd_algorithm=ps.BestFirstSearch()
    )[qf_name]

    finished_at = timeit.default_timer()
    visited_after = QF.calculate_statistics_invocation_count

    times_dicts.append({
        QF_NAME_COL: qf_name,
        OPTIMISTIC_ESTIMATE_COL: QF.metric_optimistic_estimate is not None,
        DEPTH_COL: DEPTH,
        SAMPLE_INDEX_COL: run_index,
        TIME_COL: finished_at - started_at,
        NUM_VISITED_SUBGROUPS_COL: visited_after - visited_before,
        OPTIMIZATION_MODE_COL: QF.optimization_mode
    })

    # Collect the subgroups of this run (interestingness value and pattern).
    results_per_qf_and_depth.extend(
        {
            "optimization_mode": QF.optimization_mode,
            "qf_name": qf_name,
            "search_depth": DEPTH,
            "interestingness": entry[0][0],
            "pattern": entry[0][1],
        }
        for entry in results
    )

## Save the results

In [None]:
# Make sure the output directory exists before writing, as is done for the
# other output directories in this notebook. An empty path is left alone.
if STAGE_OUTPUT_PATH2:
    os.makedirs(STAGE_OUTPUT_PATH2, exist_ok=True)

# Runtime measurements: one row per timed run.
times_df = pd.DataFrame(times_dicts)
print(times_df)
times_df.to_csv(STAGE_OUTPUT_PATH2 + "/" + PARAM_MEASUREMENTS_OUT_FILENAME, index=False)

# Result sets: one row per subgroup returned by any run.
results_df = pd.DataFrame(results_per_qf_and_depth)
print(results_df)
results_df.to_csv(STAGE_OUTPUT_PATH2 + "/" + PARAM_RESULT_SETS_OUT_FILENAME, index=False)

## Save the Interestingness Measure

In [None]:
# Persist the interestingness measure object that was used for the runs.
qf_pickle_path = STAGE_OUTPUT_PATH2 + "/" + PARAM_QF_OUTPUT_FILENAME
with open(qf_pickle_path, "wb") as qf_pickle_file:
    pickle.dump(QF, qf_pickle_file)

## Print Process Status Info

In [None]:
# Report the process status at the end of the notebook via the project helper
# (presumably resource usage of this process — confirm in subroc.util).
util.print_proc_status()