# Generate the data for analytical plots about the impact of the size and class imbalance factors

## Default Values for Papermill Parameters

In [1]:
# Number of dataset sizes in the size sweep (sizes 2 .. PARAM_NUM_SIZES + 1 are evaluated).
PARAM_NUM_SIZES = 99
# Number of negative class ratios in the class balance sweep.
PARAM_NUM_CLASS_BALANCES = 100
# Number of correlation values, evenly spaced over [-1, 1], used in both sweeps.
PARAM_NUM_CORRELATIONS = 100  # must be even
# Number of random datasets that are generated and averaged per grid point.
PARAM_SAMPLE_SIZE = 20

# Given as a string; converted to a bool with util.str_to_bool further down.
PARAM_ENABLE_PLOTS = "False"

# Seed of the NumPy random generator that drives the data generation.
PARAM_SEED = 0

## Prepare constant variables that will be used throughout the notebook.

In [2]:
from subroc.quality_functions.sklearn_metrics import get_soft_classification_metric_qf_dict
from subroc.quality_functions.base_qf import OptimizationMode
from subroc.quality_functions.custom_metrics import get_custom_metric_qf_dict
from subroc.quality_functions.soft_classifier_target import SoftClassifierTarget
from subroc.util import create_subgroup
from subroc import util

import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

# Directory the result CSVs are written to; can be overridden through the
# STAGE_OUTPUT_PATH environment variable.
STAGE_OUTPUT_PATH = os.environ.get("STAGE_OUTPUT_PATH", "../outputs")

# Single seeded generator shared by all data generation, so that a fresh
# top-to-bottom run is reproducible.
NP_RNG = np.random.default_rng(PARAM_SEED)

# The parameter default is a string. Convert it only while it still is one, so that
# re-running this cell (or injecting an actual bool) does not pass a bool to str_to_bool.
if isinstance(PARAM_ENABLE_PLOTS, str):
    PARAM_ENABLE_PLOTS = util.str_to_bool(PARAM_ENABLE_PLOTS)

## Set QFs

In [3]:
def _fresh_qfs(**weight_overrides) -> list:
    """Create the three quality functions under study and set the given attributes on each.

    Every call goes through the factory functions again, once per list entry, so
    each list below is built from its own factory calls. ``weight_overrides`` maps
    attribute names (e.g. ``subgroup_size_weight``) to the value assigned to them.
    """
    maximal = OptimizationMode.Maximal
    qfs = [
        get_custom_metric_qf_dict(maximal)["average_ranking_loss"],
        get_soft_classification_metric_qf_dict(maximal)["sklearn.metrics.roc_auc_score"],
        get_custom_metric_qf_dict(maximal)["prc_auc_score"],
    ]
    for quality_function in qfs:
        for attribute, weight in weight_overrides.items():
            setattr(quality_function, attribute, weight)
    return qfs


# QFs with their default weights.
base_qfs = _fresh_qfs()

# Same QFs with the subgroup size weight set to 1.
size_factor_qfs = _fresh_qfs(subgroup_size_weight=1)

# Same QFs with the subgroup class balance weight set to 1.
class_imbalance_factor_qfs = _fresh_qfs(subgroup_class_balance_weight=1)

# All QF configurations that the sweeps below iterate over.
QFS = base_qfs + size_factor_qfs + class_imbalance_factor_qfs

## Generate the base data

In [None]:
PROB_COL = "prob"
CLASS_COL = "class"
PRED_COL = "pred"


# Similar to reference data generation in "When to consult precision-recall curves"
def generate_data(
    size: int,
    negative_class_ratio: float,
    correlation: float,
    rng: "np.random.Generator | None" = None,
) -> pd.DataFrame:
    """Generate ground-truth/prediction pairs from a bivariate normal distribution.

    Args:
        size: Number of rows to draw.
        negative_class_ratio: Quantile level (0.0 - 1.0) of the PROB_COL column that
            serves as class threshold; rows strictly above it become the positive class.
        correlation: Theoretical correlation between PROB_COL and PRED_COL. Negative
            values are realised by sampling with the absolute correlation and
            negating the predictions afterwards.
        rng: Random generator to draw from. Defaults to the notebook-wide NP_RNG.

    Returns:
        A DataFrame with the columns PROB_COL, PRED_COL and the boolean CLASS_COL.
    """
    if rng is None:
        rng = NP_RNG

    flip_preds = correlation < 0
    abs_correlation = abs(correlation)

    mean = [0, 0]
    covariance_matrix = [[1, abs_correlation], [abs_correlation, 1]]

    data = pd.DataFrame(rng.multivariate_normal(mean, covariance_matrix, size=size), columns=[PROB_COL, PRED_COL])
    prob_quantile = np.quantile(data[PROB_COL], negative_class_ratio)
    data[CLASS_COL] = data[PROB_COL] > prob_quantile

    # Degenerate labelling: keep the "higher score -> positive" direction of the
    # threshold rule above instead of inverting it.
    num_positives = int(data[CLASS_COL].sum())
    if num_positives == 0:
        # No positives: promote the highest-scoring row(s) to the positive class.
        data[CLASS_COL] = data[PROB_COL] >= data[PROB_COL].max()
    elif num_positives == len(data):
        # No negatives: demote the lowest-scoring row(s) to the negative class.
        data[CLASS_COL] = data[PROB_COL] > data[PROB_COL].min()

    if flip_preds:
        data[PRED_COL] = -data[PRED_COL]

    return data

## Define plot data generation functions for interestingness measures over varying size, class balance, and correlation

In [5]:


def _qf_label(qf) -> str:
    """Return the string that identifies a QF configuration in the result dictionaries."""
    return f"{qf.optimization_mode} {qf.__name__} {qf.subgroup_size_weight} {qf.subgroup_class_balance_weight}"


def _synthetic_dataset_quality(qf) -> float:
    """Average ``qf.dataset_quality`` over PARAM_SAMPLE_SIZE reference datasets.

    The reference datasets use the median negative class ratio (0.5), correlation 0
    and PARAM_NUM_SIZES + 1 rows. The result serves as a fixed "dataset quality"
    for the relative QF computation in the sweeps.
    """
    reference_score_sample = []
    for _ in range(PARAM_SAMPLE_SIZE):
        reference_data = generate_data(PARAM_NUM_SIZES + 1, 0.5, 0)
        reference_target = SoftClassifierTarget(CLASS_COL, PRED_COL)
        qf.calculate_constant_statistics(reference_data, reference_target)
        reference_score_sample.append(qf.dataset_quality)
    synthetic_dataset_quality = np.average(reference_score_sample)
    print(f"Synthetic dataset quality: {synthetic_dataset_quality}")
    return synthetic_dataset_quality


def _mean_score(qf, size: int, negative_class_ratio: float, correlation: float, dataset_quality: float) -> float:
    """Average the QF score over PARAM_SAMPLE_SIZE freshly generated datasets."""
    score_sample = []
    for _ in range(PARAM_SAMPLE_SIZE):
        data = generate_data(size, negative_class_ratio, correlation)

        # Subgroup built from an empty selector list -- presumably covers the whole dataset; verify in subroc.util.
        subgroup = create_subgroup(data, [])
        target = SoftClassifierTarget(CLASS_COL, PRED_COL)

        qf.calculate_constant_statistics(data, target)
        # Replace the per-dataset quality by the fixed synthetic reference value.
        qf.dataset_quality = dataset_quality
        statistics = qf.calculate_statistics(subgroup, target, data)
        score_sample.append(qf.evaluate(subgroup, target, data, statistics))
    return np.average(score_sample)


def _show_score_matrix(score_matrix: np.ndarray, title: str, x_label: str) -> None:
    """Display the score matrix as a heat map (rows: correlation, columns: swept factor)."""
    plt.matshow(score_matrix)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel("Correlation")
    plt.colorbar()
    plt.show()


def _generate_plot_data(x_column: str, x_label: str, x_values, data_args, plot: bool) -> dict:
    """Sweep one factor against the correlation for every QF in QFS.

    Args:
        x_column: Name of the result column that holds the swept factor.
        x_label: Axis label of the swept factor in the optional plot.
        x_values: Sized iterable with the values of the swept factor.
        data_args: Callable mapping a value of the swept factor to the
            ``(size, negative_class_ratio)`` pair handed to ``generate_data``.
        plot: Whether to show a heat map of the averaged scores per QF.

    Returns:
        Dictionary mapping the QF label to a DataFrame with the columns
        ``x_column``, "correlation" and "score" (one row per grid point).
    """
    # The correlation grid is the same for every QF and every value of the swept factor.
    correlation_grid = np.linspace(-1, 1, num=PARAM_NUM_CORRELATIONS)
    data_per_qf = {}

    for qf in QFS:
        synthetic_dataset_quality = _synthetic_dataset_quality(qf)

        xs = []
        correlations = []
        scores = []
        score_matrix = np.zeros([len(correlation_grid), len(x_values)])

        for i, x in enumerate(tqdm(x_values)):
            size, negative_class_ratio = data_args(x)
            for j, correlation in enumerate(correlation_grid):
                score = _mean_score(qf, size, negative_class_ratio, correlation, synthetic_dataset_quality)

                xs.append(x)
                correlations.append(correlation)
                scores.append(score)
                # Rows are filled bottom-up, so row 0 holds the highest correlation.
                score_matrix[len(correlation_grid) - j - 1, i] = score

        qf_string = _qf_label(qf)
        data_per_qf[qf_string] = pd.DataFrame({x_column: xs, "correlation": correlations, "score": scores})

        if plot:
            _show_score_matrix(score_matrix, qf_string, x_label)

    return data_per_qf


def generate_size_plot_data(plot: bool = False) -> dict:
    """Compute averaged QF scores over dataset size and correlation.

    Sizes 2 .. PARAM_NUM_SIZES + 1 are evaluated at a negative class ratio of 0.5.

    Args:
        plot: Whether to show a heat map of the scores for every QF.

    Returns:
        Dictionary mapping the QF label to a DataFrame with the columns
        "size", "correlation" and "score".
    """
    return _generate_plot_data(
        "size",
        "Size",
        range(2, PARAM_NUM_SIZES + 2),
        lambda size: (size, 0.5),
        plot,
    )


def generate_class_balance_plot_data(plot: bool = False, size: int = 100) -> dict:
    """Compute averaged QF scores over negative class ratio and correlation.

    The ratios are PARAM_NUM_CLASS_BALANCES evenly spaced values strictly between 0 and 1.

    Args:
        plot: Whether to show a heat map of the scores for every QF.
        size: Number of rows of every generated dataset.

    Returns:
        Dictionary mapping the QF label to a DataFrame with the columns
        "negative_class_ratio", "correlation" and "score".
    """
    # Drop the first grid point (0) and exclude the endpoint (1) so both classes can occur.
    negative_class_ratios = np.linspace(0, 1, num=PARAM_NUM_CLASS_BALANCES + 1, endpoint=False)[1:]
    return _generate_plot_data(
        "negative_class_ratio",
        "Negative Class Ratio",
        negative_class_ratios,
        lambda negative_class_ratio: (size, negative_class_ratio),
        plot,
    )



## Generate plot data for interestingness measures over varying size and correlation

In [None]:
# Run the size/correlation sweep for every QF in QFS; heat maps are shown only if PARAM_ENABLE_PLOTS is true.
size_plot_data_per_qf = generate_size_plot_data(plot=PARAM_ENABLE_PLOTS)

## Generate plot data for interestingness measures over varying class balance and correlation

In [None]:
# Run the class-balance/correlation sweep for every QF in QFS; heat maps are shown only if PARAM_ENABLE_PLOTS is true.
class_balance_plot_data_per_qf = generate_class_balance_plot_data(plot=PARAM_ENABLE_PLOTS)

## Save the results

In [None]:
# Create the output directory up front; to_csv does not create missing directories.
if STAGE_OUTPUT_PATH:
    os.makedirs(STAGE_OUTPUT_PATH, exist_ok=True)

for qf in QFS:
    # Same key format that the plot data generation functions use for their result dictionaries.
    qf_string = f"{qf.optimization_mode} {qf.__name__} {qf.subgroup_size_weight} {qf.subgroup_class_balance_weight}"

    size_plot_data = size_plot_data_per_qf[qf_string]
    class_balance_plot_data = class_balance_plot_data_per_qf[qf_string]

    # Keep only the lower-cased part after the last "." of the optimization mode's string form.
    qf_optimization_mode_file_string = str(qf.optimization_mode).split(".")[-1].lower()
    # Both files of a QF configuration share this suffix.
    file_suffix = f"{qf.__name__}_{qf_optimization_mode_file_string}_s{qf.subgroup_size_weight}_cb{qf.subgroup_class_balance_weight}.csv"

    size_plot_data.to_csv(
        os.path.join(STAGE_OUTPUT_PATH, f"result_factors_analytical_size_{file_suffix}"),
        index=False,
    )
    class_balance_plot_data.to_csv(
        os.path.join(STAGE_OUTPUT_PATH, f"result_factors_analytical_class_balance_{file_suffix}"),
        index=False,
    )

## Print Process Status Info

In [None]:
# Final step: print process status information through the project's util helper.
util.print_proc_status()