# Augment a Subgroup Discovery Result Set with Statistics (Cover Size, Class Balance, and Negative Class Ratio) for Each Subgroup

## Default Values for Papermill Parameters

In [23]:
# Papermill parameter defaults. All values are plain strings; the boolean
# switches are converted with util.str_to_bool in the setup cell.

# CSV result set to augment (read with pd.read_csv after the path has been
# passed through util.prepend_experiment_output_path).
PARAM_RESULT_SET_PATH = "../outputs/metrics_augmented_result_set.csv"
# File name of the augmented CSV, written below STAGE_OUTPUT_PATH.
PARAM_AUGMENTATION_RESULT_FILENAME = "statistics_augmented_result_set.csv"
# Directory handed to DatasetReader (after util.prepend_experiment_output_path).
PARAM_DATA_IN_PATH = "../../data"

# Human-readable dataset name; resolved with to_DatasetName.
PARAM_DATASET_NAME = "OpenML Adult"
# Each enabled switch adds one column to the output:
# "cover_size", "class_balance" and "negative_class_ratio" respectively.
PARAM_ENABLE_STATISTIC_COVER_SIZE = "True"
PARAM_ENABLE_STATISTIC_CLASS_BALANCE = "True"
PARAM_ENABLE_STATISTIC_NEGATIVE_CLASS_RATIO = "False"

## Import and Set Parameters

In [24]:
from subroc.datasets.metadata import to_DatasetName
from subroc.datasets.reader import DatasetReader, DatasetStage
from subroc.quality_functions.base_qf import label_balance_fraction
from subroc.quality_functions.soft_classifier_target import SoftClassifierTarget
from subroc.util import create_subgroup
from subroc import util

import pandas as pd
import os

# fill environment variables into params
# Rewrite the path parameters via util.prepend_experiment_output_path
# (presumably prefixes them with an environment-provided experiment root --
# see subroc.util to confirm).
PARAM_RESULT_SET_PATH = util.prepend_experiment_output_path(PARAM_RESULT_SET_PATH)
PARAM_DATA_IN_PATH = util.prepend_experiment_output_path(PARAM_DATA_IN_PATH)

# Output directory of this stage; falls back to "../outputs" when the
# STAGE_OUTPUT_PATH environment variable is not set.
STAGE_OUTPUT_PATH = os.environ.get("STAGE_OUTPUT_PATH", "../outputs")

# Dataset
dataset_reader = DatasetReader(PARAM_DATA_IN_PATH)

DATA_OUT_PATH = f"{STAGE_OUTPUT_PATH}/data/processed"
# exist_ok=True avoids the check-then-create race of a separate
# os.path.exists() test when several stages start concurrently.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

DATASET_NAME = to_DatasetName(PARAM_DATASET_NAME)

if DATASET_NAME is None:
    # NOTE(review): execution continues with DATASET_NAME = None after this
    # message; consider failing fast here if read_dataset cannot handle None.
    print(f"dataset name '{PARAM_DATASET_NAME}' not supported.")

DATASET_STAGE = DatasetStage.PROCESSED_MODEL_PREDICTED

# The papermill switches arrive as strings; convert them to real booleans
# before they are used in `if` conditions.
PARAM_ENABLE_STATISTIC_COVER_SIZE = util.str_to_bool(PARAM_ENABLE_STATISTIC_COVER_SIZE)
PARAM_ENABLE_STATISTIC_CLASS_BALANCE = util.str_to_bool(PARAM_ENABLE_STATISTIC_CLASS_BALANCE)
PARAM_ENABLE_STATISTIC_NEGATIVE_CLASS_RATIO = util.str_to_bool(PARAM_ENABLE_STATISTIC_NEGATIVE_CLASS_RATIO)

## Read the Data

In [25]:
# Load the dataset at the configured stage; the reader returns the
# (train, test) pair together with the dataset metadata.
dataset_splits, dataset_meta = dataset_reader.read_dataset(DATASET_NAME, DATASET_STAGE)
train_data, test_data = dataset_splits

## Read the Result Set

In [26]:
# Load the subgroup discovery result set that is to be augmented.
# The path is passed directly; wrapping a single variable in an f-string
# added nothing.
result_set = pd.read_csv(PARAM_RESULT_SET_PATH)

## Augment the Result Set

In [27]:
original_columns = result_set.columns.values.tolist()

# One output column per enabled statistic, in the same order in which the
# values are appended to `statistics` inside the loop below.
statistic_columns = []
if PARAM_ENABLE_STATISTIC_COVER_SIZE:
    statistic_columns.append("cover_size")
if PARAM_ENABLE_STATISTIC_CLASS_BALANCE:
    statistic_columns.append("class_balance")
if PARAM_ENABLE_STATISTIC_NEGATIVE_CLASS_RATIO:
    statistic_columns.append("negative_class_ratio")

# sd objects
target = SoftClassifierTarget(dataset_meta.gt_name, dataset_meta.score_name)

# The ground-truth labels of the covered test instances are needed by both
# label-based statistics, so they are selected at most once per subgroup.
needs_covered_labels = (
    PARAM_ENABLE_STATISTIC_CLASS_BALANCE
    or PARAM_ENABLE_STATISTIC_NEGATIVE_CLASS_RATIO
)

# Rows are collected in a list and turned into a DataFrame once at the end;
# growing a DataFrame with `.loc[i] = ...` copies it on every iteration.
augmented_rows = []
for result in result_set.itertuples(index=False):
    # recreate the pysubgroup object for the subgroup with a representation for the dataset
    sel_conjunction = util.from_str_Conjunction(result.pattern)
    subgroup = create_subgroup(test_data, sel_conjunction.selectors)

    if needs_covered_labels:
        covered_labels = test_data.loc[subgroup.representation, target.gt_name]

    # compute the enabled statistics
    statistics = []
    if PARAM_ENABLE_STATISTIC_COVER_SIZE:
        statistics.append(sum(subgroup.representation))
    if PARAM_ENABLE_STATISTIC_CLASS_BALANCE:
        statistics.append(label_balance_fraction(covered_labels))
    if PARAM_ENABLE_STATISTIC_NEGATIVE_CLASS_RATIO:
        if len(covered_labels) > 0:
            statistics.append(sum(covered_labels == 0) / len(covered_labels))
        else:
            # A subgroup that covers no test instance has no defined ratio;
            # record NaN (an empty CSV field) instead of dividing by zero.
            statistics.append(float("nan"))

    # index=False makes the tuple hold exactly the original column values,
    # in column order, so it can be extended with the statistics directly.
    augmented_rows.append(list(result) + statistics)

metrics_augmented_result_set = pd.DataFrame(
    augmented_rows, columns=(original_columns + statistic_columns)
)

## Write the Augmentation Result

In [28]:
# Persist the augmented result set into the stage output directory,
# without the DataFrame index column.
augmentation_result_path = f"{STAGE_OUTPUT_PATH}/{PARAM_AUGMENTATION_RESULT_FILENAME}"
metrics_augmented_result_set.to_csv(augmentation_result_path, index=False)