# Manipulate Given Columns in Given Subgroups

## Default Values for Papermill Parameters

In [None]:
# Location of the input datasets; passed through
# util.prepend_experiment_output_path before it is handed to DatasetReader.
PARAM_DATA_IN_PATH = "../../data"
# Output directory for the manipulated splits; appended to STAGE_OUTPUT_PATH.
PARAM_DATA_OUT_PATH = "data/0.7_0.8"
# Dataset name as a string; converted with to_DatasetName.
PARAM_DATASET_NAME = "OpenML Adult"

# CSV file that provides the subgroup patterns to manipulate (read with pd.read_csv).
PARAM_PATTERNS_IN_PATH = "../outputs/0.7_0.8_picked_pattern.csv"

# Columns whose values are manipulated inside each subgroup.
PARAM_PERMUTATION_COLUMNS = ["score"]  # or ["class"]
# Manipulation applied to those columns; "permute" and "negate" are supported.
PARAM_INJECTION_PROCEDURE = "permute"  # or "negate"

# Seed for the NumPy random generator that is passed to the permutation.
PARAM_SEED = 0

## Import and Set Parameters

In [None]:
from subroc.datasets.metadata import to_DatasetName
from subroc.datasets.reader import DatasetReader, DatasetStage
from subroc import util

import pandas as pd
import numpy as np
import os

# Rewrite the input paths with the project helper before they are used.
PARAM_DATA_IN_PATH = util.prepend_experiment_output_path(PARAM_DATA_IN_PATH)
PARAM_PATTERNS_IN_PATH = util.prepend_experiment_output_path(PARAM_PATTERNS_IN_PATH)

# get environment variables
STAGE_OUTPUT_PATH = os.environ.get("STAGE_OUTPUT_PATH", "../..")

# Dataset
dataset_reader = DatasetReader(PARAM_DATA_IN_PATH)

# Create the output directory (including parents); an existing directory is fine.
DATA_OUT_PATH = f"{STAGE_OUTPUT_PATH}/{PARAM_DATA_OUT_PATH}"
os.makedirs(DATA_OUT_PATH, exist_ok=True)

DATASET_NAME = to_DatasetName(PARAM_DATASET_NAME)

# Fail fast instead of continuing with DATASET_NAME = None, which would only
# surface later as a less helpful error while reading the data.
if DATASET_NAME is None:
    raise ValueError(f"dataset name '{PARAM_DATASET_NAME}' not supported.")

DATASET_STAGE = DatasetStage.PROCESSED_MODEL_PREDICTED

# read the model-predicted train/test splits together with the dataset metadata
(train_data, test_data), dataset_meta = dataset_reader.read_dataset(DATASET_NAME, DATASET_STAGE)
# NOTE(review): _read_processed is a private method of DatasetReader; it is
# used here to load the two holdout splits from comma-separated files.
holdout_significance_data = dataset_reader._read_processed(dataset_meta, "model_predicted_holdout_significance.csv", ",")
holdout_generalizability_data = dataset_reader._read_processed(dataset_meta, "model_predicted_holdout_generalizability.csv", ",")

permutation_patterns = pd.read_csv(PARAM_PATTERNS_IN_PATH)

# Seeded generator so that the permutation is reproducible.
rng = np.random.default_rng(PARAM_SEED)

## Permute or Negate Subgroups in the Dataset

In [None]:
from subroc.datasets.contamination import permute_columns_subgroup, negate_columns_subgroup
from subroc.util import create_subgroup, from_str_Conjunction


def inject(data, pattern, split_name):
    """Apply the configured injection procedure to one subgroup of ``data``.

    The subgroup is built from the selectors of the conjunction parsed from
    ``pattern``. Depending on ``PARAM_INJECTION_PROCEDURE`` the columns in
    ``PARAM_PERMUTATION_COLUMNS`` are permuted (using the module-level ``rng``)
    or negated for that subgroup. The injection helpers are called for their
    effect on ``data``; their return values are not used.

    Afterwards a short report is printed: the pattern, the split name and the
    share of values in the configured columns that differ from their value
    before the injection, relative to the number of values covered by the
    subgroup.

    Args:
        data: DataFrame of one data split containing the configured columns.
        pattern: String representation of the subgroup description.
        split_name: Label of the split, used only in the printed report.

    Raises:
        ValueError: If ``PARAM_INJECTION_PROCEDURE`` is neither "permute"
            nor "negate".
    """
    # Take an explicit copy so the snapshot can never alias the frame that is
    # manipulated below.
    values_before = data[PARAM_PERMUTATION_COLUMNS].to_numpy(copy=True)

    subgroup = create_subgroup(data, from_str_Conjunction(pattern).selectors)

    if PARAM_INJECTION_PROCEDURE == "permute":
        permute_columns_subgroup(data, PARAM_PERMUTATION_COLUMNS, subgroup, rng)
    elif PARAM_INJECTION_PROCEDURE == "negate":
        negate_columns_subgroup(data, PARAM_PERMUTATION_COLUMNS, subgroup)
    else:
        raise ValueError(f"Injection procedure {PARAM_INJECTION_PROCEDURE} not supported.")

    print("pattern:", pattern)
    print("split name:", split_name)
    cover_size = np.sum(subgroup.representation)
    if cover_size != 0:
        changed_count = np.sum(data[PARAM_PERMUTATION_COLUMNS].to_numpy() != values_before)
        # The changed values are counted over all configured columns, so the
        # denominator has to count every covered value as well; otherwise the
        # result could exceed 1 with more than one column.
        columns_per_row = values_before.shape[1] if values_before.ndim > 1 else 1
        print("ratio of changed values at fixed positions inside the subgroup:", changed_count / (cover_size * columns_per_row))
    else:
        print("nothing changed, because the subgroup has an empty cover")
    print()


# Every pattern is injected into all four splits, always in this split order,
# so the shared random generator is consumed in a fixed sequence.
splits_to_inject = (
    (train_data, "Training"),
    (test_data, "Search"),
    (holdout_significance_data, "Holdout Significance"),
    (holdout_generalizability_data, "Holdout Generalizability"),
)

# NOTE(review): iterating a DataFrame yields its column labels, so the patterns
# are taken from the header row of the patterns CSV — confirm this is intended.
for pattern in permutation_patterns:
    for split_frame, split_label in splits_to_inject:
        inject(split_frame, pattern, split_label)

## Save the Result

In [30]:
out_path = DATA_OUT_PATH
# makedirs (not mkdir) so nested output paths work even if parents are
# missing; exist_ok avoids the separate existence check.
os.makedirs(out_path, exist_ok=True)

# Output file name -> manipulated split written under that name.
outputs_by_file_name = {
    "permuted_model_predicted_train.csv": train_data,
    "permuted_model_predicted_test.csv": test_data,
    "permuted_model_predicted_holdout_significance.csv": holdout_significance_data,
    "permuted_model_predicted_holdout_generalizability.csv": holdout_generalizability_data,
}

# Write every split without the DataFrame index.
for file_name, frame in outputs_by_file_name.items():
    frame.to_csv(os.path.join(out_path, file_name), index=False)