# Augment a Subgroup Discovery Result Set with Metric Values for each Subgroup

## Default Values for Papermill Parameters

In [33]:
# Names of the metrics to compute for every subgroup; each name is used as a key
# into qfs_dict and as the name of an added result column.
PARAM_METRICS = ["average_ranking_loss"]
# CSV file holding the result set that gets augmented.
PARAM_RESULT_SET_PATH = "../outputs/merged_result_set.csv"
# File name of the augmented CSV; it is written below STAGE_OUTPUT_PATH.
PARAM_AUGMENTATION_RESULT_FILENAME = "metrics_augmented_result_set.csv"
# Path handed to the DatasetReader.
PARAM_DATA_IN_PATH = "../../data"

# Dataset name as text; it is converted with to_DatasetName before use.
PARAM_DATASET_NAME = "OpenML Adult"

## Import and Set Parameters

In [34]:
from subroc.datasets.metadata import to_DatasetName
from subroc.datasets.reader import DatasetReader, DatasetStage
from subroc.quality_functions.sklearn_metrics import get_soft_classification_metric_qf_dict
from subroc.quality_functions.base_qf import OptimizationMode
from subroc.quality_functions.custom_metrics import get_custom_metric_qf_dict
from subroc.preconditions import constraint_for_precondition_disjunction
from subroc.util import create_subgroup
from subroc import util

import pandas as pd
import numpy as np
import os

# fill environment variables into params
PARAM_RESULT_SET_PATH = util.prepend_experiment_output_path(PARAM_RESULT_SET_PATH)
PARAM_DATA_IN_PATH = util.prepend_experiment_output_path(PARAM_DATA_IN_PATH)

# get environment variables (falls back to the relative default when unset)
STAGE_OUTPUT_PATH = os.environ.get("STAGE_OUTPUT_PATH", "../outputs")

# Dataset
dataset_reader = DatasetReader(PARAM_DATA_IN_PATH)

# exist_ok=True makes the directory creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
DATA_OUT_PATH = f"{STAGE_OUTPUT_PATH}/data/processed"
os.makedirs(DATA_OUT_PATH, exist_ok=True)

DATASET_NAME = to_DatasetName(PARAM_DATASET_NAME)

# Stop right here on an unknown dataset name: continuing with DATASET_NAME = None
# would only surface later as a harder-to-trace failure when the data is read.
if DATASET_NAME is None:
    raise ValueError(f"dataset name '{PARAM_DATASET_NAME}' not supported.")

DATASET_STAGE = DatasetStage.PROCESSED_MODEL_PREDICTED

# Interestingness measures that supply the metrics, keyed by metric name.
# Every entry pairs the factory that builds a dict of quality functions with the
# key to pick from that dict; each factory is called once per entry, in the listed order.
qfs_dict = {
    metric_key: build_qf_dict(OptimizationMode.Maximal)[metric_key]
    for build_qf_dict, metric_key in (
        (get_custom_metric_qf_dict, "average_ranking_loss"),
        (get_soft_classification_metric_qf_dict, "sklearn.metrics.roc_auc_score"),
        (get_custom_metric_qf_dict, "prc_auc_score"),
    )
}

# Metric callables in the same order as the requested metric names.
metrics = [qfs_dict[metric_name].metric for metric_name in PARAM_METRICS]

## Read and Sort the Data

In [35]:
# read data
(train_data, test_data), dataset_meta = dataset_reader.read_dataset(DATASET_NAME, DATASET_STAGE)

# Sort test_data by the score column (ascending) and keep views on the sorted
# score and ground-truth columns for later boolean masking.
dataset_sorted_by_score = test_data.sort_values(dataset_meta.score_name)
scores_sorted = dataset_sorted_by_score.loc[:, dataset_meta.score_name]
gt_sorted_by_score = dataset_sorted_by_score.loc[:, dataset_meta.gt_name]

# Index labels of test_data in sorted order. Reading them from the index directly
# yields the same labels as iterating the rows, without building a Series per row.
sorted_to_original_index = dataset_sorted_by_score.index.tolist()

## Create the Constraints

In [None]:
# One list of constraints per requested metric, aligned with PARAM_METRICS.
# A quality function without a "preconditions" attribute gets an empty list.
constraints_per_metric = []

for metric_name in PARAM_METRICS:
    quality_function = qfs_dict[metric_name]
    constraints_per_metric.append([
        constraint_for_precondition_disjunction(disjunction, test_data, dataset_meta)
        for disjunction in getattr(quality_function, "preconditions", ())
    ])

# trailing expression so that the notebook displays the constraints
constraints_per_metric

## Read the Result Set

In [37]:
# Load the result set that is going to be augmented with metric values.
result_set = pd.read_csv(str(PARAM_RESULT_SET_PATH))

## Augment the Result Set

In [38]:
original_columns = result_set.columns.values.tolist()

# Rows are gathered in a plain list and turned into a DataFrame once at the end;
# enlarging a DataFrame row by row copies it on every assignment.
augmented_rows = []

for result in result_set.itertuples():
    # recreate the pysubgroup object for the subgroup with a representation for the dataset
    sel_conjunction = util.from_str_Conjunction(result.pattern)
    subgroup = create_subgroup(test_data, sel_conjunction.selectors)

    # Membership mask of the subgroup cover, reordered to match the score-sorted data.
    # NOTE(review): subgroup.representation is indexed with the index labels of
    # test_data, which presumably requires those labels to be valid positions - confirm.
    sorted_subgroup_representation = \
        [subgroup.representation[original_index] for original_index in sorted_to_original_index]

    # get true and predicted labels for subgroup cover
    sorted_subgroup_y_true = gt_sorted_by_score[sorted_subgroup_representation].to_numpy()
    sorted_subgroup_y_pred = scores_sorted[sorted_subgroup_representation].to_numpy()

    # compute the metric values; a metric whose constraints are not all satisfied yields NaN
    metric_values = []
    for metric, constraints in zip(metrics, constraints_per_metric):
        if all(constraint.is_satisfied(subgroup, data=test_data) for constraint in constraints):
            metric_values.append(metric(sorted_subgroup_y_true, sorted_subgroup_y_pred))
        else:
            metric_values.append(np.nan)

    # result[0] is the index field added by itertuples(); the remaining fields are
    # the values of the original columns in column order.
    augmented_rows.append(list(result[1:]) + metric_values)

metrics_augmented_result_set = pd.DataFrame(augmented_rows, columns=(original_columns + PARAM_METRICS))

## Write the Augmentation Result

In [39]:
# Write the augmented result set as CSV into the stage output directory, without the index column.
augmentation_result_path = f"{STAGE_OUTPUT_PATH}/{PARAM_AUGMENTATION_RESULT_FILENAME}"
metrics_augmented_result_set.to_csv(augmentation_result_path, index=False)