# Augment a Subgroup Discovery Result Set with the Empty Pattern (Whole Dataset)

## Default Values for Papermill Parameters

In [1]:
# Default parameter values for this notebook; the heading above marks this cell
# as the papermill parameters cell.

# CSV file holding the subgroup discovery result set that gets augmented.
PARAM_RESULT_SET_PATH = "../outputs/sd_result_set_average_ranking_loss.csv"
# Despite the name, this string is used as a PREFIX of the output file name:
# the output is "<this>_<basename of PARAM_RESULT_SET_PATH>".
PARAM_RESULT_OUT_FILENAME_NO_SUFFIX = "with_empty_pattern"
# Pickle file containing the interestingness measure (quality function) object.
PARAM_INTERESTINGNESS_MEASURE_PATH = "../outputs/interestingness_measure.pickle"
# Directory handed to DatasetReader for loading the dataset.
PARAM_DATA_IN_PATH = "../../data"
# NOTE(review): only passed through the path helper in this notebook; no model
# is loaded from here in the visible cells.
PARAM_MODELS_IN_PATH = "../../models"

# Human-readable names that are converted with to_DatasetName / to_ModelName.
PARAM_DATASET_NAME = "OpenML Adult"
PARAM_MODEL_NAME = "sklearn_gaussian_nb_adult"
# When not None, this value replaces the computed interestingness of the
# empty pattern.
PARAM_FIXED_INTERESTINGNESS = None

## Import and Set Parameters

In [2]:
from subroc.model_serialization import to_ModelName
from subroc.datasets.metadata import to_DatasetName
from subroc.datasets.reader import DatasetReader, DatasetStage
from subroc.quality_functions.soft_classifier_target import SoftClassifierTarget
from subroc.util import create_subgroup
from subroc import util

import pandas as pd
import pickle
import os

# Resolve every path parameter through the project helper, which fills the
# experiment output location (taken from the environment) into the params.
PARAM_RESULT_SET_PATH = util.prepend_experiment_output_path(PARAM_RESULT_SET_PATH)
PARAM_INTERESTINGNESS_MEASURE_PATH = util.prepend_experiment_output_path(PARAM_INTERESTINGNESS_MEASURE_PATH)
PARAM_DATA_IN_PATH = util.prepend_experiment_output_path(PARAM_DATA_IN_PATH)
PARAM_MODELS_IN_PATH = util.prepend_experiment_output_path(PARAM_MODELS_IN_PATH)

# Output root of this stage; falls back to "../outputs" when the environment
# variable is not set.
STAGE_OUTPUT_PATH = os.environ.get("STAGE_OUTPUT_PATH", "../outputs")

# Dataset
dataset_reader = DatasetReader(PARAM_DATA_IN_PATH)

DATA_OUT_PATH = f"{STAGE_OUTPUT_PATH}/data/processed"
# exist_ok=True replaces the former exists()-then-create check, which could
# fail if another process created the directory between the check and the call.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

DATASET_NAME = to_DatasetName(PARAM_DATASET_NAME)

# NOTE(review): an unsupported name is only reported; execution continues with
# DATASET_NAME set to None. Confirm whether failing fast here would be preferable.
if DATASET_NAME is None:
    print(f"dataset name '{PARAM_DATASET_NAME}' not supported.")

# The data is read at the stage named PROCESSED_MODEL_PREDICTED.
DATASET_STAGE = DatasetStage.PROCESSED_MODEL_PREDICTED

# Model
MODEL_NAME = to_ModelName(PARAM_MODEL_NAME)

# Same reporting-only behaviour as for the dataset name above.
if MODEL_NAME is None:
    print(f"model name '{PARAM_MODEL_NAME}' not supported.")

## Read the Data

In [3]:
# Load both data splits together with the dataset metadata at the configured
# dataset stage.
(train_data, test_data), dataset_meta = dataset_reader.read_dataset(
    DATASET_NAME,
    DATASET_STAGE,
)

## Read the Result Set

In [4]:
# Load the existing result set; the path is converted to a plain string
# before it is handed to pandas.
result_set = pd.read_csv(str(PARAM_RESULT_SET_PATH))

## Read the Interestingness Measure

In [5]:
# Restore the pickled interestingness measure (quality function) object.
# NOTE(review): unpickling can execute arbitrary code, so this path must only
# ever point to a trusted file.
with open(PARAM_INTERESTINGNESS_MEASURE_PATH, mode="rb") as measure_file:
    QF = pickle.load(measure_file)

## Add the Empty Pattern to the Result Set

In [6]:
# Build the subgroup object for the empty pattern; the conjunction is parsed
# from the string "Dataset" and the subgroup is created over the test data.
dataset_conjunction = util.from_str_Conjunction("Dataset")
dataset_subgroup = create_subgroup(test_data, dataset_conjunction.selectors)

# Target defined by the ground-truth and score names from the dataset metadata.
classifier_target = SoftClassifierTarget(dataset_meta.gt_name, dataset_meta.score_name)

# The quality function is always evaluated on the test data, even when a fixed
# interestingness value replaces the result afterwards.
dataset_statistics = QF.calculate_statistics(dataset_subgroup, classifier_target, test_data)
dataset_interestingness = QF.evaluate(dataset_subgroup, classifier_target, test_data, dataset_statistics)

if PARAM_FIXED_INTERESTINGNESS is not None:
    dataset_interestingness = PARAM_FIXED_INTERESTINGNESS

# Start with None for every existing result set column, then let the known
# values win; the unpacking order matters because later entries overwrite
# earlier ones.
dataset_row = {
    **dict.fromkeys(result_set.columns.values),
    **{"interestingness": dataset_interestingness, "pattern": "Dataset"},
}

# Append the new row at the end and renumber the index.
dataset_row_frame = pd.DataFrame(dataset_row, index=[0])
result_set = pd.concat([result_set, dataset_row_frame], ignore_index=True)

## Write the Augmentation Result

In [7]:
# The output file keeps the input file's base name, prefixed with the
# configured output name, and is written without the index column.
augmented_file_name = f"{PARAM_RESULT_OUT_FILENAME_NO_SUFFIX}_{os.path.basename(PARAM_RESULT_SET_PATH)}"
augmented_file_path = f"{STAGE_OUTPUT_PATH}/{augmented_file_name}"
result_set.to_csv(augmented_file_path, index=False)