# Filtering a Result Set Based on the Statistical Significance of Each Subgroup
## Statistical Significance is Determined by Random Sampling

## Default Values for Papermill Parameters

In [None]:
# Default parameter values; this cell is the one papermill overrides.

# CSV result set to filter. It is passed through
# util.prepend_experiment_output_path in the import cell before being read.
PARAM_RESULT_SET_PATH = "../outputs/p_value_augmented_result_set.csv"
# File name of the filtered result set; it is written below STAGE_OUTPUT_PATH.
PARAM_FILTERING_RESULT_FILENAME = "p_value_filtered_result_set.csv"

# Significance level handed to statsmodels' multipletests.
PARAM_SIGNIFICANCE_ALPHA = 0.05
# Correction method name handed to statsmodels' multipletests.
PARAM_MULTIPLE_TESTING_CORRECTION = "bonferroni"
# The three flags below are strings here and are converted with
# util.str_to_bool in the import cell.
# Exclude NaN p-values from the correction and drop those rows.
PARAM_FILTER_NAN_P_VALUES = "True"
# Drop rows whose "filtering interestingness" is <= 0.
PARAM_FILTER_NEGATIVE_FILTERING_INTERESTINGNESS = "True"
# When true, "p-value" and "filtering interestingness" are left out of the
# output; when false, all columns are kept and "corrected p-value" is added.
PARAM_REMOVE_FILTERING_ATTRIBUTES = "False"

## Import and Set Parameters

In [None]:
from subroc import util

import pandas as pd
import numpy as np
import os
from statsmodels.stats.multitest import multipletests

# Directory this stage writes to; falls back to "../outputs" when the
# STAGE_OUTPUT_PATH environment variable is not set.
STAGE_OUTPUT_PATH = os.getenv("STAGE_OUTPUT_PATH", "../outputs")

# Let the util helper fill the experiment output path into the input path
# (presumably taken from the environment -- see util for details).
PARAM_RESULT_SET_PATH = util.prepend_experiment_output_path(PARAM_RESULT_SET_PATH)

# The flag parameters are defined as strings; convert them to real booleans.
PARAM_FILTER_NAN_P_VALUES = util.str_to_bool(PARAM_FILTER_NAN_P_VALUES)
PARAM_FILTER_NEGATIVE_FILTERING_INTERESTINGNESS = util.str_to_bool(
    PARAM_FILTER_NEGATIVE_FILTERING_INTERESTINGNESS
)
PARAM_REMOVE_FILTERING_ATTRIBUTES = util.str_to_bool(PARAM_REMOVE_FILTERING_ATTRIBUTES)

## Read the Result Set

In [None]:
# Load the p-value augmented result set that is about to be filtered.
result_set = pd.read_csv(PARAM_RESULT_SET_PATH)
# Trailing expression so the notebook displays the loaded frame.
result_set

## Filter the Result Set

In [None]:
# Build the filtered result set: correct the p-values for multiple testing,
# then keep only the rows whose null hypothesis is rejected and that pass the
# optional interestingness / NaN filters.
original_columns = result_set.columns.values.tolist()

# Columns that only exist to drive the filtering decision.
filtering_attributes = ["p-value", "filtering interestingness"]
if not PARAM_REMOVE_FILTERING_ATTRIBUTES:
    # Keep every input column and add the corrected p-value.
    output_columns = original_columns + ["corrected p-value"]
else:
    output_columns = [column for column in original_columns if column not in filtering_attributes]
filtered_result_set = pd.DataFrame(columns=output_columns)

all_pvalues = np.array(result_set["p-value"].tolist())
num_tests = len(all_pvalues)

# Defaults: nothing is rejected and no corrected p-value is known. They stay
# in place for NaN p-values (when those are excluded from the correction) and
# whenever there is nothing to correct at all.
pvals_corrected = np.full(num_tests, np.nan)
reject = np.zeros(num_tests, dtype=bool)

# Multiple Testing Correction Method
if PARAM_FILTER_NAN_P_VALUES:
    # NaN p-values are left out of the correction so they do not count as tests.
    no_nan_mask = np.logical_not(np.isnan(all_pvalues))
    # multipletests derives its thresholds by dividing by the number of tests,
    # so it must not be called when every p-value is NaN (or there are no rows).
    if no_nan_mask.any():
        no_nan_reject, no_nan_pvals_corrected, _, _ = multipletests(
            all_pvalues[no_nan_mask],
            alpha=PARAM_SIGNIFICANCE_ALPHA,
            method=PARAM_MULTIPLE_TESTING_CORRECTION,
        )
        pvals_corrected[no_nan_mask] = no_nan_pvals_corrected
        reject[no_nan_mask] = no_nan_reject
elif num_tests > 0:
    reject, pvals_corrected, _, _ = multipletests(
        all_pvalues,
        alpha=PARAM_SIGNIFICANCE_ALPHA,
        method=PARAM_MULTIPLE_TESTING_CORRECTION,
    )

for i in range(len(result_set)):
    row = result_set.iloc[i]
    pattern = row["pattern"]
    filtering_interestingness = row["filtering interestingness"]
    pvalue = row["p-value"]
    pvalue_corrected = pvals_corrected[i]
    curr_reject = reject[i]

    print(f"pattern: {pattern}")
    print(f"filtering interestingness: {filtering_interestingness}")
    print(f"p-value: {pvalue}")
    print(f"corrected p-value: {pvalue_corrected}")
    print(f"rejected H_0: {'Yes' if curr_reject else 'No'}")
    # Separate the log entries of all rows, including the ones dropped below.
    print()

    # Drop the row if it fails any enabled filter or if the corrected test
    # did not reject the null hypothesis.
    if (PARAM_FILTER_NEGATIVE_FILTERING_INTERESTINGNESS and filtering_interestingness <= 0) or \
            (PARAM_FILTER_NAN_P_VALUES and np.isnan(pvalue)) or \
            not curr_reject:
        continue

    # Append the surviving row to filtered_result_set, keeping its position
    # in result_set as the index label.
    if not PARAM_REMOVE_FILTERING_ATTRIBUTES:
        filtered_result_set.loc[i] = [row[column] for column in original_columns] + [pvalue_corrected]
    else:
        filtered_result_set.loc[i] = [row[column] for column in output_columns]

## Write the Filtered Result Set

In [None]:
# Write the surviving rows to the stage output directory, without the index column.
filtered_result_path = f"{STAGE_OUTPUT_PATH}/{PARAM_FILTERING_RESULT_FILENAME}"
filtered_result_set.to_csv(filtered_result_path, index=False)