# Series of experiments on turning the ARL metric into a useful interestingness measure

## Default Values for Papermill Parameters

In [17]:
# Default values for the papermill-injected parameters of this notebook.
# Boolean switches are given as the strings "True"/"False" and are converted
# with util.str_to_bool in the setup cell.

# Key of the interestingness measure to use.
PARAM_QF = "average_ranking_loss" # or "sklearn.metrics.roc_auc_score" or "prc_auc_score"
# Basename (without extension) of the pickled interestingness measure.
PARAM_QF_OUTPUT_BASENAME = "interestingness_measure"
# Basename of the result set CSV; "" falls back to f"sd_result_set_{PARAM_QF}".
PARAM_RESULT_SET_OUTPUT_BASENAME = ""
# Input directories; passed through util.prepend_experiment_output_path before use.
PARAM_DATA_IN_PATH = "../../data"
PARAM_MODELS_IN_PATH = "../../models"

# When true, the third element of each result is merged into its output row.
PARAM_INCLUDE_STATISTICS = "False"

# subgroup discovery parameters
PARAM_DATASET_NAME = "OpenML Adult"
# None selects DatasetStage.PROCESSED_MODEL_READY.
PARAM_DATASET_STAGE = None
PARAM_MODEL_NAME = "sklearn_gaussian_nb_adult"
# "Apriori" or "BestFirstSearch" (compared case-insensitively).
PARAM_SD_ALGORITHM = "Apriori"
# "Maximal" or "Minimal" (case-insensitive); any other value selects Exceptional.
PARAM_OPTIMIZATION_MODE = "Maximal"
PARAM_ENABLE_OPTIMISTIC_ESTIMATE = "True"
PARAM_ENABLE_MINIMUM_PER_CLASS_SUPPORT_CONSTRAINT = "False"
# NOTE(review): not read anywhere in the visible cells -- confirm where it is consumed.
PARAM_MINIMUM_PER_CLASS_SUPPORT_CONSTRAINT_VALUE = 10
PARAM_MINIMUM_SUPPORT_CONSTRAINT = None  # default: ceil(len(test_data) / 100)
# Assigned to QF.subgroup_size_weight and QF.subgroup_class_balance_weight.
PARAM_COVER_SIZE_WEIGHT = 1
PARAM_CLASS_BALANCE_WEIGHT = 1
# Passed as `depth` to run_subgroup_discovery.
PARAM_DEPTH = 3
PARAM_RESULT_SET_SIZE = 5
PARAM_ENABLE_RANDOM_SAMPLING_P_VALUE_FACTOR = "False"
# Assigned to QF.num_random_samples.
PARAM_NUM_RANDOM_SAMPLES = 10000
PARAM_ENABLE_GENERALIZATION_AWARENESS = "False"
# Seed handed to the quality-function factories.
PARAM_SEED = 0

## Import and Set Parameters

In [None]:
from subroc.model_serialization import deserialize
from subroc.quality_functions.base_qf import OptimizationMode, PredictionType
from subroc.datasets.metadata import to_DatasetName
from subroc.datasets.reader import DatasetReader, DatasetStage
from subroc.quality_functions.soft_classifier_target import SoftClassifierTarget
from subroc.quality_functions.custom_metrics import get_custom_metric_qf_dict
from subroc.quality_functions.sklearn_metrics import get_soft_classification_metric_qf_dict
from subroc.selectors import create_selectors
from subroc.sd_process import run_subgroup_discovery
from subroc import util

import pysubgroup as ps
import pandas as pd
import pickle
import math
import os

# Let the project helper prepend the experiment output path to both input
# directories.
PARAM_DATA_IN_PATH = util.prepend_experiment_output_path(PARAM_DATA_IN_PATH)
PARAM_MODELS_IN_PATH = util.prepend_experiment_output_path(PARAM_MODELS_IN_PATH)

# Both names read the same environment variable; they differ only in the
# fallback that applies when STAGE_OUTPUT_PATH is not set.
STAGE_OUTPUT_PATH = os.getenv("STAGE_OUTPUT_PATH", "../..")
STAGE_OUTPUT_PATH2 = os.getenv("STAGE_OUTPUT_PATH", "../outputs")

# An empty basename means "derive it from the selected quality function".
PARAM_RESULT_SET_OUTPUT_BASENAME = (
    f"sd_result_set_{PARAM_QF}"
    if PARAM_RESULT_SET_OUTPUT_BASENAME == ""
    else PARAM_RESULT_SET_OUTPUT_BASENAME
)

# The flag arrives as a string; convert it to a real bool.
PARAM_INCLUDE_STATISTICS = util.str_to_bool(PARAM_INCLUDE_STATISTICS)

##### Subgroup Discovery Parameters #####

# Dataset
dataset_reader = DatasetReader(PARAM_DATA_IN_PATH)

# Directory that receives the data enriched with model predictions.
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
DATA_OUT_PATH = f"{STAGE_OUTPUT_PATH}/data/processed"
os.makedirs(DATA_OUT_PATH, exist_ok=True)

DATASET_NAME = to_DatasetName(PARAM_DATASET_NAME)

# Fail here with a clear message instead of carrying a None dataset name into
# the cells that read the data.
if DATASET_NAME is None:
    raise ValueError(f"dataset name '{PARAM_DATASET_NAME}' not supported.")

# No explicit stage means the model-ready processed data is used.
if PARAM_DATASET_STAGE is None:
    DATASET_STAGE = DatasetStage.PROCESSED_MODEL_READY
else:
    DATASET_STAGE = DatasetStage(PARAM_DATASET_STAGE)

# Model
model = deserialize(PARAM_MODELS_IN_PATH, PARAM_MODEL_NAME)

# SD Algorithm
# Turn the configured algorithm name into a pysubgroup algorithm instance.
# Only string values are resolved, so re-running this cell (when the name has
# already been replaced by an instance) leaves the instance untouched.
if isinstance(PARAM_SD_ALGORITHM, str):
    _sd_algorithm_factories = {
        "apriori": ps.Apriori,
        "bestfirstsearch": ps.BestFirstSearch,
    }
    _sd_algorithm_key = PARAM_SD_ALGORITHM.lower()

    # An unknown name used to be passed on as a plain string; reject it here.
    if _sd_algorithm_key not in _sd_algorithm_factories:
        raise ValueError(
            f"SD algorithm '{PARAM_SD_ALGORITHM}' not supported; "
            "expected 'Apriori' or 'BestFirstSearch'."
        )

    PARAM_SD_ALGORITHM = _sd_algorithm_factories[_sd_algorithm_key]()

# Selectors: Defined in a later cell

# Constraints
DEPTH = PARAM_DEPTH  # configures the default constraints
CONSTRAINTS = []  # define later

# Constraints may be updated in a later cell according to this param.
PARAM_ENABLE_MINIMUM_PER_CLASS_SUPPORT_CONSTRAINT = util.str_to_bool(PARAM_ENABLE_MINIMUM_PER_CLASS_SUPPORT_CONSTRAINT)

# Optimistic Estimate
ENABLE_OPTIMISTIC_ESTIMATES = util.str_to_bool(PARAM_ENABLE_OPTIMISTIC_ESTIMATE)

# Interestingness Measure(s)
# Every candidate is built with the same settings; only the factory that
# provides it differs. The factories are called in the listed order.
_qf_factory_kwargs = {
    "seed": PARAM_SEED,
    "enable_optimistic_estimates": ENABLE_OPTIMISTIC_ESTIMATES,
}
_qf_sources = (
    ("average_ranking_loss", get_custom_metric_qf_dict),
    ("sklearn.metrics.roc_auc_score", get_soft_classification_metric_qf_dict),
    ("prc_auc_score", get_custom_metric_qf_dict),
)
qfs_dict = {
    qf_key: qf_dict_factory(OptimizationMode.Maximal, **_qf_factory_kwargs)[qf_key]
    for qf_key, qf_dict_factory in _qf_sources
}

# Pick the measure requested through the notebook parameter.
QF = qfs_dict[PARAM_QF]

# Random-sampling settings of the selected measure.
QF.num_random_samples = PARAM_NUM_RANDOM_SAMPLES
QF.random_sampling_p_value_factor = util.str_to_bool(PARAM_ENABLE_RANDOM_SAMPLING_P_VALUE_FACTOR)

# The flag arrives as a string; convert it to a real bool.
PARAM_ENABLE_GENERALIZATION_AWARENESS = util.str_to_bool(PARAM_ENABLE_GENERALIZATION_AWARENESS)

# The optimization mode effectively modifies the interestingness measure.
# Names are matched case-insensitively; anything that is neither "maximal"
# nor "minimal" selects the Exceptional mode.
_optimization_modes_by_name = {
    "maximal": OptimizationMode.Maximal,
    "minimal": OptimizationMode.Minimal,
}
OPTIMIZATION_MODE = _optimization_modes_by_name.get(
    str.lower(PARAM_OPTIMIZATION_MODE),
    OptimizationMode.Exceptional,
)

# Interestingness Minimum
MIN_QUALITY = 0

# Result Set Size
RESULT_SET_SIZE = PARAM_RESULT_SET_SIZE

## Get and Preprocess the Data

In [None]:
# Load the train/test split together with the dataset metadata.
dataset_splits, dataset_meta = dataset_reader.read_dataset(DATASET_NAME, DATASET_STAGE)
train_data, test_data = dataset_splits

if DATASET_STAGE == DatasetStage.PROCESSED_MODEL_READY:
    # The stored data has no predictions yet: mark the prediction type and
    # add the model output as the score column.
    dataset_meta.prediction_type = PredictionType.CLASSIFICATION_SOFT

    # The model is fed every column except the ground truth.
    train_feature_mask = train_data.columns != dataset_meta.gt_name
    train_data_x = train_data.loc[:, train_feature_mask]
    train_data[dataset_meta.score_name] = model.predict(train_data_x)

    test_feature_mask = test_data.columns != dataset_meta.gt_name
    test_data_x = test_data.loc[:, test_feature_mask]
    test_data[dataset_meta.score_name] = model.predict(test_data_x)

    # Persist both splits including the predictions.
    out_path = f"{DATA_OUT_PATH}/{dataset_meta.dataset_dir}"
    if not os.path.exists(out_path):
        os.mkdir(out_path)

    train_data.to_csv(f"{out_path}/model_predicted_train.csv", index=False)
    test_data.to_csv(f"{out_path}/model_predicted_test.csv", index=False)

# Target object built from the ground-truth and score column names.
target = SoftClassifierTarget(dataset_meta.gt_name, dataset_meta.score_name)

## Define Selectors (Search Space)

In [20]:
IGNORE_NULL = True

# Candidate selectors over every column except ground truth and score.
intermediate_search_space = create_selectors(
    test_data,
    nbins=10,
    ignore=[dataset_meta.gt_name, dataset_meta.score_name],
    ignore_null=IGNORE_NULL,
)

# Dummy attributes contain values that correspond to inequality of an original
# nominal value. An equality selector on such a (falsy) value effectively
# defines an inequality selector on the original nominal value, so those
# selectors are left out of the search space.
SEARCH_SPACE = [
    candidate
    for candidate in intermediate_search_space
    if not isinstance(candidate, ps.EqualitySelector) or candidate.attribute_value
]

## Define Constraints (minimum support)

In [None]:
# Use the configured minimum support; without one, fall back to one hundredth
# of the test data size, rounded up.
min_support = PARAM_MINIMUM_SUPPORT_CONSTRAINT
if min_support is None:
    min_support = math.ceil(len(test_data) / 100)

CONSTRAINTS.append(ps.MinSupportConstraint(min_support))

# Run Subgroup Discovery

In [None]:
# Configure the selected measure for this run.
QF.optimization_mode = OPTIMIZATION_MODE

print(f"--- {QF.__name__} with {OPTIMIZATION_MODE} ---")

# Enable the size and class imbalance terms.
QF.subgroup_size_weight = PARAM_COVER_SIZE_WEIGHT
QF.subgroup_class_balance_weight = PARAM_CLASS_BALANCE_WEIGHT

# Run the search for the single configured measure; the results are looked up
# under the measure's name.
all_results = run_subgroup_discovery(
    [QF],
    [QF.__name__],
    test_data,
    dataset_meta,
    target=target,
    search_space=SEARCH_SPACE,
    constraints=CONSTRAINTS,
    depth=DEPTH,
    min_quality=MIN_QUALITY,
    result_set_size=RESULT_SET_SIZE,
    ignore_null=IGNORE_NULL,
    summarize=True,
    enable_generalization_awareness=PARAM_ENABLE_GENERALIZATION_AWARENESS,
    sd_algorithm=PARAM_SD_ALGORITHM,
)
results = all_results[QF.__name__]

# Flatten the nested results into one row dict per entry. Element 0 is stored
# as the interestingness, element 1 as the pattern; element 2 is merged into
# the row only when statistics are requested.
results_dicts = []
for entry in (item for group in results for item in group):
    row = {"interestingness": entry[0], "pattern": entry[1]}

    if PARAM_INCLUDE_STATISTICS:
        row.update(entry[2])

    results_dicts.append(row)

## Save the Result Sets

In [None]:
# Build the result table. Without statistics only the two core columns are
# written; with statistics every key collected in the row dicts becomes a
# column. Both branches assign `results_df`: the statistics branch used to
# assign a misspelled name, which left `results_df` as None and made the
# export below fail.
if PARAM_INCLUDE_STATISTICS:
    results_df = pd.DataFrame(results_dicts)
else:
    results_df = pd.DataFrame(results_dicts, columns=["interestingness", "pattern"])

results_df.to_csv(f"{STAGE_OUTPUT_PATH2}/{PARAM_RESULT_SET_OUTPUT_BASENAME}.csv", index=False)

## Save the Interestingness Measure

In [24]:
# With generalization awareness enabled, the measure is wrapped before it is
# stored.
if PARAM_ENABLE_GENERALIZATION_AWARENESS:
    QF = ps.GeneralizationAwareQF(QF)

# Serialize the (possibly wrapped) measure next to the result set.
qf_pickle_path = f"{STAGE_OUTPUT_PATH2}/{PARAM_QF_OUTPUT_BASENAME}.pickle"
with open(qf_pickle_path, "wb") as qf_file:
    pickle.dump(QF, qf_file)

# Print Process Status Info

In [None]:
# Print process status info through the project's util helper.
util.print_proc_status()