# Find Subgroups in a Given Size Range on the Search Split of a Dataset

## Default Values for Papermill Parameters

In [8]:
# Default values for the notebook parameters; a papermill run may override them.

# Input data directory. It is passed through util.prepend_experiment_output_path
# and then handed to DatasetReader in the setup cell.
PARAM_DATA_IN_PATH = "../../data"
# Dataset name as a string; resolved with to_DatasetName in the setup cell.
PARAM_DATASET_NAME = "OpenML Adult"

# Inclusive lower and upper bound on a pattern's cover size, expressed as a
# fraction of len(test_data). The trailing comments list alternative values.
PARAM_MIN_SIZE_FRACTION = 0.7  # 0.2, 0.45, 0.7
PARAM_MAX_SIZE_FRACTION = 0.8  # 0.3, 0.55, 0.8
# Largest number of selectors combined into a single "AND" pattern.
PARAM_DEPTH = 3

# Name of the CSV file the picked pattern is written to; it is appended to
# STAGE_OUTPUT_PATH + "/" in the last cell, so the leading "/" here produces
# a doubled separator in the final path.
# NOTE(review): the name is computed right here from the fraction values above.
# If the fractions are overridden after this cell has run, the file name is not
# recomputed - confirm that callers also pass a matching file name.
PARAM_PATTERNS_OUT_FILENAME = f"/{PARAM_MIN_SIZE_FRACTION}_{PARAM_MAX_SIZE_FRACTION}_picked_pattern.csv"

# Seed for the NumPy random generator that picks the final pattern.
PARAM_SEED = 0

## Import and Set Parameters

In [15]:
from subroc.datasets.metadata import to_DatasetName, meta_dict
from subroc.datasets.reader import DatasetReader, DatasetStage
from subroc import util
import pysubgroup as ps
import numpy as np
import os

# Rewrite the data directory with the project helper (judging by its name it
# prefixes the experiment output path - confirm against subroc.util).
PARAM_DATA_IN_PATH = util.prepend_experiment_output_path(PARAM_DATA_IN_PATH)

# Output directory of this stage; falls back to "../outputs" when the
# STAGE_OUTPUT_PATH environment variable is not set.
STAGE_OUTPUT_PATH = os.environ.get("STAGE_OUTPUT_PATH", "../outputs")

# Dataset
dataset_reader = DatasetReader(PARAM_DATA_IN_PATH)

DATASET_NAME = to_DatasetName(PARAM_DATASET_NAME)

# Fail fast with a clear message: merely printing and carrying on would index
# meta_dict with None on the very next statement.
if DATASET_NAME is None:
    raise ValueError(f"dataset name '{PARAM_DATASET_NAME}' not supported.")

dataset_meta = meta_dict[DATASET_NAME]

DATASET_STAGE = DatasetStage.PROCESSED_MODEL_READY

# Read the dataset at the model-ready stage. Only the second element of the
# returned pair of splits is kept (as test_data); dataset_meta is rebound to
# the metadata returned by the reader.
(_, test_data), dataset_meta = dataset_reader.read_dataset(DATASET_NAME, DATASET_STAGE)

# Seeded generator so the random pick at the end of the notebook is reproducible.
rng = np.random.default_rng(PARAM_SEED)

## Define Selectors (Search Space)

In [None]:
from subroc.selectors import create_selectors

# Forwarded unchanged to create_selectors as its ignore_null argument.
IGNORE_NULL = True

# Columns that must not be turned into selectors (presumably the ground-truth
# label and the model score - confirm against the dataset metadata).
excluded_columns = [dataset_meta.gt_name, dataset_meta.score_name]

intermediate_search_space = create_selectors(
    test_data,
    nbins=10,
    ignore=excluded_columns,
    ignore_null=IGNORE_NULL,
)

# Dummy attributes carry values that stand for "not equal to the original
# nominal value". An equality selector on such a falsy value is in effect an
# inequality selector on the original attribute, so those selectors are left
# out; every other selector is kept.
SEARCH_SPACE = [
    candidate
    for candidate in intermediate_search_space
    if not isinstance(candidate, ps.EqualitySelector) or candidate.attribute_value
]

## Generate Patterns and Compute Cover Sizes

In [None]:
from subroc.util import create_subgroup, from_str_Conjunction

import itertools
from tqdm.notebook import tqdm

# Enumerate every conjunction of 1..PARAM_DEPTH distinct selectors from the
# search space and store each one as an " AND "-joined string.
# The combinations are consumed lazily; they are not materialised as an
# intermediate list per depth.
pattern_strings = []
for depth in tqdm(range(1, PARAM_DEPTH + 1)):
    pattern_strings.extend(
        " AND ".join(str(sel) for sel in selectors)
        for selectors in itertools.combinations(SEARCH_SPACE, depth)
    )

all_patterns = np.array(pattern_strings)
print(all_patterns)

# Cover size of each pattern on test_data: the sum over the subgroup's
# `representation` (presumably a boolean cover mask - confirm).
# The pattern is parsed back from its string form, so the size belongs to what
# a later reader of the stored string would reconstruct.
# NOTE(review): assumes from_str_Conjunction round-trips str(selector).
all_cover_sizes = np.array([
    np.sum(create_subgroup(test_data, from_str_Conjunction(pattern).selectors).representation)
    for pattern in tqdm(all_patterns)
])

## Filter Patterns from Search Space

In [None]:
# Translate the fractional bounds into absolute cover sizes once, up front.
n_rows = len(test_data)
min_cover_size = PARAM_MIN_SIZE_FRACTION * n_rows
max_cover_size = PARAM_MAX_SIZE_FRACTION * n_rows

# Positions (into all_patterns / all_cover_sizes) of the patterns whose cover
# size lies inside the inclusive [min, max] range.
filtered_patterns_idx = [
    idx
    for idx, cover_size in enumerate(tqdm(all_cover_sizes))
    if min_cover_size <= cover_size <= max_cover_size
]

# Show the surviving patterns as the cell output.
all_patterns[filtered_patterns_idx]

## Histogram of Remaining Subgroup Sizes

In [None]:
import matplotlib.pyplot as plt

# Histogram of the cover sizes that survived the filter, on an x-axis spanning
# 0..len(test_data). To inspect all patterns instead, plot all_cover_sizes.
total_rows = len(test_data)
selected_cover_sizes = all_cover_sizes[filtered_patterns_idx]

plt.hist(selected_cover_sizes, bins=100, range=(0, total_rows))

# Red vertical lines mark the lower and upper size bound.
plt.axvline(x=PARAM_MIN_SIZE_FRACTION * total_rows, color="red")
plt.axvline(x=PARAM_MAX_SIZE_FRACTION * total_rows, color="red")

## Pick a Filtered Subgroup Randomly

In [None]:
import pandas as pd

# Draw one of the size-filtered patterns with the seeded generator and store
# it as a single-cell CSV file in the stage output directory.
candidate_patterns = all_patterns[filtered_patterns_idx]

# rng.choice fails with an unhelpful message on an empty array; say what
# actually went wrong. (.size is used because array truthiness is ambiguous.)
if candidate_patterns.size == 0:
    raise ValueError(
        "no pattern has a cover size between "
        f"{PARAM_MIN_SIZE_FRACTION} and {PARAM_MAX_SIZE_FRACTION} of the data; "
        "nothing to pick."
    )

picked_pattern = rng.choice(candidate_patterns)
print(picked_pattern)

# Strip a leading "/" from the file name before joining: os.path.join would
# otherwise treat it as an absolute path and drop STAGE_OUTPUT_PATH entirely.
patterns_out_path = os.path.join(STAGE_OUTPUT_PATH, PARAM_PATTERNS_OUT_FILENAME.lstrip("/"))

# No header row and no index column: the file holds only the pattern string.
pd.DataFrame([picked_pattern]).to_csv(patterns_out_path, header=False, index=False)