# Compute Subgroup p-values based on Random Sampling

## Default Values for Papermill Parameters

In [None]:
# Default parameter values; the section heading marks them as papermill parameters.

# CSV with the subgroup result set that gets augmented (read via pd.read_csv below).
PARAM_RESULT_SET_PATH = "../outputs/sd_result_set_average_ranking_loss.csv"
# Pickled interestingness measure (quality function) that is loaded below.
PARAM_QF_PATH = "../outputs/interestingness_measure.pickle"
# Basename (".pickle" is appended) under which the configured QF is saved again.
PARAM_QF_OUTPUT_BASENAME = "p_value_augmentation_interestingness_measure"
# Filename of the augmented result set CSV written to the stage output path.
PARAM_FILTERING_RESULT_FILENAME = "p_value_augmented_result_set.csv"
# Path handed to DatasetReader.
PARAM_DATA_IN_PATH = "../../data"
# Path handed to deserialize() together with PARAM_MODEL_NAME.
PARAM_MODELS_IN_PATH = "../../models"

# Human-readable dataset name; converted with to_DatasetName below.
PARAM_DATASET_NAME = "OpenML Adult"
# None selects DatasetStage.PROCESSED_MODEL_READY; otherwise passed to DatasetStage(...).
PARAM_DATASET_STAGE = None
# Name of the serialized model to load.
PARAM_MODEL_NAME = "sklearn_gaussian_nb_adult_4_splits"
# Assigned to QF.num_random_samples for the random-sampling p-value computation.
PARAM_NUM_RANDOM_SAMPLES = 10000

## Import and Set Parameters

In [None]:
from subroc.datasets.metadata import to_DatasetName
from subroc.datasets.reader import DatasetReader, DatasetStage, meta_dict
from subroc.model_serialization import deserialize
from subroc.quality_functions.base_qf import PredictionType
from subroc.quality_functions.soft_classifier_target import SoftClassifierTarget
from subroc import util

import pysubgroup as ps
import pandas as pd
import numpy as np
import pickle
import os

# fill environment variables into params
# (each input path is passed through util.prepend_experiment_output_path)
PARAM_RESULT_SET_PATH = util.prepend_experiment_output_path(PARAM_RESULT_SET_PATH)
PARAM_QF_PATH = util.prepend_experiment_output_path(PARAM_QF_PATH)
PARAM_DATA_IN_PATH = util.prepend_experiment_output_path(PARAM_DATA_IN_PATH)
PARAM_MODELS_IN_PATH = util.prepend_experiment_output_path(PARAM_MODELS_IN_PATH)

# get environment variables (falls back to "../outputs" when STAGE_OUTPUT_PATH is unset)
STAGE_OUTPUT_PATH = os.environ.get("STAGE_OUTPUT_PATH", "../outputs")

# Dataset
dataset_reader = DatasetReader(PARAM_DATA_IN_PATH)

DATA_OUT_PATH = f"{STAGE_OUTPUT_PATH}/data/processed"
# exist_ok=True replaces the separate existence check, which could race with
# another process creating the same directory between the check and the call.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

DATASET_NAME = to_DatasetName(PARAM_DATASET_NAME)

# Fail right here instead of only printing: the next cell indexes meta_dict
# with DATASET_NAME, so continuing with None just moves the error further away
# from its cause.
if DATASET_NAME is None:
    raise ValueError(f"dataset name '{PARAM_DATASET_NAME}' not supported.")

# No explicit stage means the data still needs model predictions.
if PARAM_DATASET_STAGE is None:
    DATASET_STAGE = DatasetStage.PROCESSED_MODEL_READY
else:
    DATASET_STAGE = DatasetStage(PARAM_DATASET_STAGE)

# Model
model = deserialize(PARAM_MODELS_IN_PATH, PARAM_MODEL_NAME)

## Get and Preprocess the Data

In [None]:
# read data and preprocess it for the model
dataset_meta = meta_dict[DATASET_NAME]

# prepare classification predictions
dataset_meta.prediction_type = PredictionType.CLASSIFICATION_SOFT

if DATASET_STAGE == DatasetStage.PROCESSED_MODEL_READY:
    data = dataset_reader._read_processed(dataset_meta, "model_ready_holdout_significance.csv", ",")

    # every column except the ground truth is handed to the model
    data_x = data.loc[:, data.columns != dataset_meta.gt_name]
    data[dataset_meta.score_name] = model.predict(data_x)

    # save data with predictions
    out_path = DATA_OUT_PATH + "/" + dataset_meta.dataset_dir
    # makedirs with exist_ok avoids the check-then-create race and also copes
    # with a missing parent or a nested dataset_dir, which os.mkdir would not.
    os.makedirs(out_path, exist_ok=True)

    data.to_csv(out_path + "/" + "model_predicted_holdout_significance.csv", index=False)
elif DATASET_STAGE == DatasetStage.PROCESSED_MODEL_PREDICTED:
    data = dataset_reader._read_processed(dataset_meta, "model_predicted_holdout_significance.csv", ",")
elif DATASET_STAGE == DatasetStage.PROCESSED_PERMUTED_MODEL_PREDICTED:
    data = dataset_reader._read_processed(dataset_meta, "permuted_model_predicted_holdout_significance.csv", ",")
else:
    # Previously `data` silently stayed None for any other stage and the
    # notebook only failed later, far away from the actual cause.
    raise ValueError(f"dataset stage '{DATASET_STAGE}' not supported.")

# sd objects
target = SoftClassifierTarget(dataset_meta.gt_name, dataset_meta.score_name)

## Read the Result Set

In [None]:
# Load the result set; a completely empty CSV file is treated as "no results"
# and yields an empty DataFrame instead of an error.
try:
    result_set = pd.read_csv(str(PARAM_RESULT_SET_PATH))
except pd.errors.EmptyDataError:
    result_set = pd.DataFrame()

## Read and Configure the Interestingness Measure

In [None]:
# NOTE: unpickling can execute arbitrary code, so PARAM_QF_PATH must point to a
# trusted file.
with open(PARAM_QF_PATH, mode="rb") as qf_file:
    QF = pickle.load(qf_file)

# Work on the wrapped quality function when a generalization-aware wrapper was stored.
QF = QF.qf if isinstance(QF, ps.GeneralizationAwareQF) else QF

QF.calculate_constant_statistics(data, target)

# Disable any significance-related changes to the qf value
QF.subgroup_size_weight = 0
QF.subgroup_class_balance_weight = 0
QF.random_sampling_p_value_factor = False
QF.random_sampling_normalization = False

# Configure the QF for computing p-values by random sampling
QF.num_random_samples = PARAM_NUM_RANDOM_SAMPLES
QF.random_sampling_distributions = {}

# update the representation of the qf-specific constraints if necessary
# (a QF without a `constraints` attribute simply has nothing to update)
for constraint in getattr(QF, "constraints", ()):
    if hasattr(constraint, "update"):
        constraint.update(data)

## Augment the Result Set (Compute p-values)

In [None]:
# The augmented result set keeps all original columns and gains two more:
# the random-sampling p-value and the plain (significance-free) qf value.
original_columns = result_set.columns.values.tolist()
augmented_result_set = pd.DataFrame(columns=original_columns + ["p-value", "filtering interestingness"])

# The QF configuration treats `constraints` as an optional attribute, so do the
# same here; an empty list means there is nothing to violate.
qf_constraints = getattr(QF, "constraints", [])

for i, result in enumerate(result_set.itertuples()):
    print(result.pattern)

    # itertuples() yields the index first, followed by the column values in
    # their original order, so everything after position 0 is the original row.
    original_values = list(result[1:])

    # recreate the pysubgroup object for the subgroup with a representation for the dataset
    sel_conjunction = util.from_str_Conjunction(result.pattern)
    subgroup = util.create_subgroup(data, sel_conjunction.selectors)

    statistics = QF.calculate_statistics(subgroup, target, data)

    # shortcut for when qf constraints are not met: keep the row, but leave
    # both new columns empty (NaN)
    if not ps.constraints_satisfied(
            qf_constraints,
            subgroup,
            statistics,
            data,
    ):
        augmented_result_set.loc[i] = original_values + [np.nan, np.nan]
        continue

    # compute the qf value (without significance)
    qf_value = QF.evaluate(subgroup, target, data, statistics)

    # compute the p-value
    subgroup_labels = data.loc[subgroup.representation, target.gt_name]
    subgroup_size = len(subgroup_labels)
    p_value = QF._compute_random_sampling_p_value(subgroup_size, subgroup_labels, target, data, qf_value)

    print(f"p-value: {p_value}")
    print(f"filtering interestingness: {qf_value}")

    # append the augmented instance to augmented_result_set
    augmented_result_set.loc[i] = original_values + [p_value, qf_value]

## Write the Augmented Result Set

In [None]:
# Write the result set including the two new columns; the row index is omitted.
augmented_result_set.to_csv(
    path_or_buf=f"{STAGE_OUTPUT_PATH}/{PARAM_FILTERING_RESULT_FILENAME}",
    index=False,
)

## Save the Interestingness Measure

In [None]:
# Serialize the QF object in its current state into the stage output directory.
with open(f"{STAGE_OUTPUT_PATH}/{PARAM_QF_OUTPUT_BASENAME}.pickle", mode="wb") as qf_out_file:
    pickle.dump(QF, qf_out_file)