In this notebook, the per-fold metric scores of each cross-validation run are aggregated per metric, and each aggregate is saved as a text dump and a pickle file.

In [None]:
import pickle
import re

from src.utils.utils_data_formatter import save_dict_to_text

Starting with Complexity for the MNIST data

In [None]:
def load_results(pickle_path):
    """Read one pickled object from ``pickle_path`` and return it.

    Args:
      pickle_path (str): Path of the pickle file to open in binary mode.

    Returns:
      The object stored in the file (in this notebook: a cross-validation dict).

    Note:
      Unpickling can execute arbitrary code, so only use this on result files
      produced by trusted runs.
    """
    with open(pickle_path, mode="rb") as handle:
        results = pickle.load(handle)
    return results

Load the results obtained by running `evaluate_mnist_dropout.py` / `evaluate_mnist_dropconnect.py`.

In [37]:
# Cross-validation results of the dropout run (drop_prob 0.2 according to the path), Complexity section.
# NOTE: `dropout_02` is re-assigned in every later metric section; run this cell right before
# the Complexity aggregation cell.
dropout_02 = load_results("/workspaces/expainable-uncertainty-quantification/results_final_mnist/Complexity/dropout/2026-01-14/08-10-52/evaluation/cross_validation/100_samples/dropout/layers_None/drop_prob0.2/cross_val_metrics_mnist.pkl")

In [38]:
# Cross-validation results of the dropconnect run (layers_1, drop_prob 0.1 according to the path), Complexity section.
# NOTE: `dropconnect_01` is re-assigned in every later metric section; run this cell right before
# the Complexity aggregation cell.
dropconnect_01 = load_results("/workspaces/expainable-uncertainty-quantification/results_final_mnist/Complexity/dropconnect/2026-01-04/17-06-16/evaluation/cross_validation/100_samples/dropconnect/layers_1/drop_prob0.1/cross_val_metrics_mnist.pkl")

In [None]:
def aggregate_cross_val_dicts(named_dicts, metric="Complexity"):
    """
    Aggregate the per-fold results of one metric across several cross-validation runs.

    Args:
      named_dicts (dict): Mapping name -> cross-validation dict.
          Example key: "dropconnect_005" or "dropout_01".
          Value: dict with keys "fold_1", "fold_2", ..., each mapping
          xai_method -> {metric_name: {"mean": ..., "std": ..., "all_values": ...}}.
      metric (str): Name of the metric block to collect, default "Complexity".

    Returns:
      dict: Aggregated dict with structure
        {uq_name: {xai_method: {"mean": [...], "std": [...], "all_values": [[...], [...], ...]}}}
        Every list holds one entry per collected fold, in ascending fold-number order.
        Folds or methods whose value is not a dict, and methods for which the metric
        is missing or empty, are skipped, so a list can be shorter than the fold count.
        Fields missing inside a metric block are stored as None.
    """
    def _fold_order(key):
        # Keys containing a number sort by that number and come first; keys without
        # one follow alphabetically. Returning tuples keeps both groups comparable,
        # so a stray non-fold key cannot make sorted() raise TypeError.
        found = re.search(r"\d+", str(key))
        if found:
            return (0, int(found.group()), "")
        return (1, 0, str(key))

    out = {}
    for name, cv_dict in named_dicts.items():
        collected = out[name] = {}
        for fold in sorted(cv_dict.keys(), key=_fold_order):
            methods = cv_dict[fold]
            if not isinstance(methods, dict):
                continue
            for xai_name, metrics_map in methods.items():
                if not isinstance(metrics_map, dict):
                    continue
                metric_block = metrics_map.get(metric)
                if not metric_block:
                    # The requested metric is missing (or empty) for this xai/fold.
                    continue

                entry = collected.setdefault(
                    xai_name, {"mean": [], "std": [], "all_values": []}
                )
                # .get() turns absent fields into None instead of raising.
                entry["mean"].append(metric_block.get("mean"))
                entry["std"].append(metric_block.get("std"))
                entry["all_values"].append(metric_block.get("all_values"))

    return out

In [None]:
# Runs to aggregate; the keys become the top-level names of the aggregated dict.
named = dict(dropout_02=dropout_02, dropconnect_01=dropconnect_01)

aggregated = aggregate_cross_val_dicts(named, metric="Complexity")

# Store the aggregate twice: through the project's text helper and as a pickle.
save_dict_to_text(
    aggregated,
    save_dir="/workspaces/expainable-uncertainty-quantification/results_final_mnist/Complexity",
    file_name="complexity_aggregated",
)
out_path = "/workspaces/expainable-uncertainty-quantification/results_final_mnist/Complexity/complexity_aggregated.pkl"
with open(out_path, "wb") as f:
    pickle.dump(aggregated, f, protocol=pickle.HIGHEST_PROTOCOL)


Repeatability (results stored under `Determinism`, metric key `DeterministicCheck`)

In [42]:
# Cross-validation results of the dropout run (drop_prob 0.2 according to the path), Determinism section.
# NOTE: re-binds `dropout_02` from the previous section; run this cell right before
# the Determinism aggregation cell.
dropout_02 = load_results("/workspaces/expainable-uncertainty-quantification/results_final_mnist/Determinism/dropout/2026-01-03/23-28-10/evaluation/cross_validation/100_samples/dropout/layers_None/drop_prob0.2/cross_val_metrics_mnist.pkl")  

In [43]:
# Cross-validation results of the dropconnect run (layers_1, drop_prob 0.1 according to the path), Determinism section.
# NOTE: re-binds `dropconnect_01` from the previous section; run this cell right before
# the Determinism aggregation cell.
dropconnect_01 = load_results("/workspaces/expainable-uncertainty-quantification/results_final_mnist/Determinism/dropconnect/2026-01-05/21-13-26/evaluation/cross_validation/100_samples/dropconnect/layers_1/drop_prob0.1/cross_val_metrics_mnist.pkl")


In [None]:
def aggregate_cross_val_dicts_cosine_spearman(named_dicts, metric="DeterministicCheck"):
    """
    Aggregate the per-fold cosine / Spearman results of one metric across several runs.

    Args:
      named_dicts (dict): Mapping name -> cross-validation dict.
          Example key: "dropconnect_005" or "dropout_01".
          Value: dict with keys "fold_1", "fold_2", ..., each mapping
          xai_method -> {metric_name: metric_block}. The metric block is read for
          "mean_cosine", "std_cosine", "mean_spearman", "std_spearman" and "all_values".
      metric (str): Name of the metric block to collect, default "DeterministicCheck".

    Returns:
      dict: Aggregated dict with structure
        {uq_name: {xai_method: {"mean_cosine": [...], "std_cosine": [...],
                                "mean_spearman": [...], "std_spearman": [...],
                                "all_values_cosine": [...], "all_values_spearman": [...]}}}
        Every list holds one entry per collected fold, in ascending fold-number order.
        Folds or methods whose value is not a dict, and methods for which the metric
        is missing or empty, are skipped. Fields missing inside a metric block
        (including a missing or too short "all_values") are stored as None.
    """
    def _fold_order(key):
        # Keys containing a number sort by that number and come first; keys without
        # one follow alphabetically. Returning tuples keeps both groups comparable,
        # so a stray non-fold key cannot make sorted() raise TypeError.
        found = re.search(r"\d+", str(key))
        if found:
            return (0, int(found.group()), "")
        return (1, 0, str(key))

    def _item_or_none(seq, index):
        # "all_values" may be absent or shorter than expected; yield None in that
        # case so it behaves like the scalar fields read with .get().
        if seq is None or len(seq) <= index:
            return None
        return seq[index]

    out = {}
    for name, cv_dict in named_dicts.items():
        collected = out[name] = {}
        for fold in sorted(cv_dict.keys(), key=_fold_order):
            methods = cv_dict[fold]
            if not isinstance(methods, dict):
                continue
            for xai_name, metrics_map in methods.items():
                if not isinstance(metrics_map, dict):
                    continue
                metric_block = metrics_map.get(metric)
                if not metric_block:
                    # The requested metric is missing (or empty) for this xai/fold.
                    continue

                entry = collected.setdefault(
                    xai_name,
                    {"mean_cosine": [], "std_cosine": [], "mean_spearman": [], "std_spearman": [],
                     "all_values_cosine": [], "all_values_spearman": []},
                )
                # .get() turns absent fields into None instead of raising.
                entry["mean_cosine"].append(metric_block.get("mean_cosine"))
                entry["std_cosine"].append(metric_block.get("std_cosine"))
                entry["mean_spearman"].append(metric_block.get("mean_spearman"))
                entry["std_spearman"].append(metric_block.get("std_spearman"))

                # Index 1 is taken as the cosine values and index 0 as the Spearman
                # values -- TODO confirm this order against the code that writes "all_values".
                all_vals = metric_block.get("all_values")
                entry["all_values_cosine"].append(_item_or_none(all_vals, 1))
                entry["all_values_spearman"].append(_item_or_none(all_vals, 0))

    return out

In [None]:
# Runs to aggregate; the keys become the top-level names of the aggregated dict.
named = dict(dropout_02=dropout_02, dropconnect_01=dropconnect_01)

aggregated = aggregate_cross_val_dicts_cosine_spearman(named, metric="DeterministicCheck")

# Store the aggregate twice: through the project's text helper and as a pickle.
save_dict_to_text(
    aggregated,
    save_dir="/workspaces/expainable-uncertainty-quantification/results_final_mnist/Determinism",
    file_name="determinism_aggregated",
)
out_path = "/workspaces/expainable-uncertainty-quantification/results_final_mnist/Determinism/determinism_aggregated.pkl"
with open(out_path, "wb") as f:
    pickle.dump(aggregated, f, protocol=pickle.HIGHEST_PROTOCOL)

Feature Flipping

In [47]:
# Cross-validation results of the dropout run (drop_prob 0.2 according to the path), Feature Flipping (top100) section.
# NOTE: re-binds `dropout_02` from the previous section; run this cell right before
# the Feature Flipping aggregation cell.
dropout_02 = load_results("/workspaces/expainable-uncertainty-quantification/results_final_mnist/FeatureFlipping/top100/dropout/Feature_Flipping_top100/2026-01-06/17-05-14/evaluation/cross_validation/100_samples/dropout/layers_None/drop_prob0.2/cross_val_metrics_mnist.pkl")  


In [48]:
# Cross-validation results of the dropconnect run (layers_1, drop_prob 0.1 according to the path), Feature Flipping (top100) section.
# NOTE: re-binds `dropconnect_01` from the previous section; run this cell right before
# the Feature Flipping aggregation cell.
dropconnect_01 = load_results("/workspaces/expainable-uncertainty-quantification/results_final_mnist/FeatureFlipping/top100/dropconnect/100_samples/dropconnect/layers_1/drop_prob0.1/cross_val_metrics_mnist.pkl")

In [None]:
def aggregate_cross_val_dicts_featureflipping(named_dicts, metric="FeatureFlipping"):
    """
    Aggregate the per-fold Feature Flipping results across several cross-validation runs.

    Args:
      named_dicts (dict): Mapping name -> cross-validation dict.
          Example key: "dropconnect_005" or "dropout_01".
          Value: dict with keys "fold_1", "fold_2", ..., each mapping
          xai_method -> {metric_name: {"auc_mean": ..., "auc_stds": ..., "all_values": ...}}.
      metric (str): Name of the metric block to collect, default "FeatureFlipping".

    Returns:
      dict: Aggregated dict with structure
        {uq_name: {xai_method: {"auc_mean": [...], "auc_stds": [...], "all_values": [...]}}}
        Every list holds one entry per collected fold, in ascending fold-number order.
        Folds or methods whose value is not a dict, and methods for which the metric
        is missing or empty, are skipped. Fields missing inside a metric block are
        stored as None.
    """
    def _fold_order(key):
        # Keys containing a number sort by that number and come first; keys without
        # one follow alphabetically. Returning tuples keeps both groups comparable,
        # so a stray non-fold key cannot make sorted() raise TypeError.
        found = re.search(r"\d+", str(key))
        if found:
            return (0, int(found.group()), "")
        return (1, 0, str(key))

    out = {}
    for name, cv_dict in named_dicts.items():
        collected = out[name] = {}
        for fold in sorted(cv_dict.keys(), key=_fold_order):
            methods = cv_dict[fold]
            if not isinstance(methods, dict):
                continue
            for xai_name, metrics_map in methods.items():
                if not isinstance(metrics_map, dict):
                    continue
                metric_block = metrics_map.get(metric)
                if not metric_block:
                    # The requested metric is missing (or empty) for this xai/fold.
                    continue

                entry = collected.setdefault(
                    xai_name, {"auc_mean": [], "auc_stds": [], "all_values": []}
                )
                # .get() turns absent fields into None instead of raising.
                entry["auc_mean"].append(metric_block.get("auc_mean"))
                entry["auc_stds"].append(metric_block.get("auc_stds"))
                entry["all_values"].append(metric_block.get("all_values"))

    return out

In [None]:
# Runs to aggregate; the keys become the top-level names of the aggregated dict.
named = dict(dropout_02=dropout_02, dropconnect_01=dropconnect_01)

aggregated = aggregate_cross_val_dicts_featureflipping(named, metric="FeatureFlipping")

# Store the aggregate twice: through the project's text helper and as a pickle.
save_dict_to_text(
    aggregated,
    save_dir="/workspaces/expainable-uncertainty-quantification/results_final_mnist/FeatureFlipping",
    file_name="feature_flipping_aggregated",
)
out_path = "/workspaces/expainable-uncertainty-quantification/results_final_mnist/FeatureFlipping/feature_flipping_aggregated.pkl"
with open(out_path, "wb") as f:
    pickle.dump(aggregated, f, protocol=pickle.HIGHEST_PROTOCOL)

Relative Input Stability

In [51]:
# Cross-validation results of the dropout run (drop_prob 0.2 according to the path), Relative Input Stability section.
# NOTE: re-binds `dropout_02` from the previous section; run this cell right before
# the Relative Input Stability aggregation cell.
dropout_02 = load_results("/workspaces/expainable-uncertainty-quantification/results_final_mnist/RelativeInputStability/dropout/100_samples/dropout/layers_None/drop_prob0.2/cross_val_metrics_mnist.pkl")  


In [52]:
# Cross-validation results of the dropconnect run (layers_1, drop_prob 0.1 according to the path), Relative Input Stability section.
# NOTE: re-binds `dropconnect_01` from the previous section; run this cell right before
# the Relative Input Stability aggregation cell.
dropconnect_01 = load_results("/workspaces/expainable-uncertainty-quantification/results_final_mnist/RelativeInputStability/dropconnect/2026-01-04/21-03-49/evaluation/cross_validation/100_samples/dropconnect/layers_1/drop_prob0.1/cross_val_metrics_mnist.pkl")

In [None]:
def aggregate_cross_val_dicts_ris(named_dicts, metric="RelativeInputStability"):
    """
    Aggregate the per-fold Relative Input Stability results across several runs.

    Args:
      named_dicts (dict): Mapping name -> cross-validation dict.
          Example key: "dropconnect_005" or "dropout_01".
          Value: dict with keys "fold_1", "fold_2", ..., each mapping
          xai_method -> {metric_name: metric_block}. The metric block is read for
          "RIS_mean", "RIS_std" and "all_values".
      metric (str): Name of the metric block to collect, default "RelativeInputStability".

    Returns:
      dict: Aggregated dict with structure
        {uq_name: {xai_method: {"RIS_mean": [...], "RIS_std": [...],
                                "all_values": [...], "nr_perturbations": [...]}}}
        Every list holds one entry per collected fold, in ascending fold-number order.
        Folds or methods whose value is not a dict, and methods for which the metric
        is missing or empty, are skipped. Fields missing inside a metric block
        (including a missing or too short "all_values") are stored as None.
    """
    def _fold_order(key):
        # Keys containing a number sort by that number and come first; keys without
        # one follow alphabetically. Returning tuples keeps both groups comparable,
        # so a stray non-fold key cannot make sorted() raise TypeError.
        found = re.search(r"\d+", str(key))
        if found:
            return (0, int(found.group()), "")
        return (1, 0, str(key))

    def _item_or_none(seq, index):
        # "all_values" may be absent or shorter than expected; yield None in that
        # case so it behaves like the scalar fields read with .get().
        if seq is None or len(seq) <= index:
            return None
        return seq[index]

    out = {}
    for name, cv_dict in named_dicts.items():
        collected = out[name] = {}
        for fold in sorted(cv_dict.keys(), key=_fold_order):
            methods = cv_dict[fold]
            if not isinstance(methods, dict):
                continue
            for xai_name, metrics_map in methods.items():
                if not isinstance(metrics_map, dict):
                    continue
                metric_block = metrics_map.get(metric)
                if not metric_block:
                    # The requested metric is missing (or empty) for this xai/fold.
                    continue

                entry = collected.setdefault(
                    xai_name,
                    {"RIS_mean": [], "RIS_std": [], "all_values": [], "nr_perturbations": []},
                )
                # .get() turns absent fields into None instead of raising.
                entry["RIS_mean"].append(metric_block.get("RIS_mean"))
                entry["RIS_std"].append(metric_block.get("RIS_std"))

                # Index 0 is stored as the values and index 1 as the number of
                # perturbations -- TODO confirm this layout against the producer.
                all_list = metric_block.get("all_values")
                entry["all_values"].append(_item_or_none(all_list, 0))
                entry["nr_perturbations"].append(_item_or_none(all_list, 1))
    return out

In [None]:
# Runs to aggregate; the keys become the top-level names of the aggregated dict.
named = dict(dropout_02=dropout_02, dropconnect_01=dropconnect_01)

aggregated = aggregate_cross_val_dicts_ris(named, metric="RelativeInputStability")

# Store the aggregate twice: through the project's text helper and as a pickle.
save_dict_to_text(
    aggregated,
    save_dir="/workspaces/expainable-uncertainty-quantification/results_final_mnist/RelativeInputStability",
    file_name="relative_input_stability_aggregated",
)
out_path = "/workspaces/expainable-uncertainty-quantification/results_final_mnist/RelativeInputStability/relative_input_stability_aggregated.pkl"
with open(out_path, "wb") as f:
    pickle.dump(aggregated, f, protocol=pickle.HIGHEST_PROTOCOL)

Relative Rank Improvement

In [55]:
# Cross-validation results of the dropout run (drop_prob 0.2 according to the path), Relative Rank Improvement section.
# NOTE: re-binds `dropout_02` from the previous section; run this cell right before
# the Relative Rank Improvement aggregation cell.
dropout_02 = load_results("/workspaces/expainable-uncertainty-quantification/results_final_mnist/RelativeRankImprovement/dropout/2026-01-03/11-37-19/evaluation/cross_validation/100_samples/dropout/layers_None/drop_prob0.2/cross_val_metrics_mnist.pkl")  

In [56]:
# Cross-validation results of the dropconnect run (layers_1, drop_prob 0.1 according to the path), Relative Rank Improvement section.
# NOTE: re-binds `dropconnect_01` from the previous section; run this cell right before
# the Relative Rank Improvement aggregation cell.
dropconnect_01 = load_results("/workspaces/expainable-uncertainty-quantification/results_final_mnist/RelativeRankImprovement/dropconnect/2026-01-05/11-14-51/evaluation/cross_validation/100_samples/dropconnect/layers_1/drop_prob0.1/cross_val_metrics_mnist.pkl")

In [None]:
def aggregate_cross_val_dicts_rri(named_dicts, metric="Relative Rank Improvement"):
    """
    Aggregate the per-fold Relative Rank Improvement results across several runs.

    Args:
      named_dicts (dict): Mapping name -> cross-validation dict.
          Example key: "dropconnect_005" or "dropout_01".
          Value: dict with keys "fold_1", "fold_2", ..., each mapping
          xai_method -> {metric_name: metric_block}. The metric block is read for
          "avg_unc_rank_change", "accuracy" and "all_values".
      metric (str): Name of the metric block to collect, default "Relative Rank Improvement".

    Returns:
      dict: Aggregated dict with structure
        {uq_name: {xai_method: {"avg_unc_rank_change": [...], "accuracy": [...],
                                "ranks": [...], "rank_changes": [...]}}}
        Every list holds one entry per collected fold, in ascending fold-number order.
        Folds or methods whose value is not a dict, and methods for which the metric
        is missing or empty, are skipped. Fields missing inside a metric block
        (including a missing or too short "all_values") are stored as None.
    """
    def _fold_order(key):
        # Keys containing a number sort by that number and come first; keys without
        # one follow alphabetically. Returning tuples keeps both groups comparable,
        # so a stray non-fold key cannot make sorted() raise TypeError.
        found = re.search(r"\d+", str(key))
        if found:
            return (0, int(found.group()), "")
        return (1, 0, str(key))

    def _item_or_none(seq, index):
        # "all_values" may be absent or shorter than expected; yield None in that
        # case so it behaves like the scalar fields read with .get().
        if seq is None or len(seq) <= index:
            return None
        return seq[index]

    out = {}
    for name, cv_dict in named_dicts.items():
        collected = out[name] = {}
        for fold in sorted(cv_dict.keys(), key=_fold_order):
            methods = cv_dict[fold]
            if not isinstance(methods, dict):
                continue
            for xai_name, metrics_map in methods.items():
                if not isinstance(metrics_map, dict):
                    continue
                metric_block = metrics_map.get(metric)
                if not metric_block:
                    # The requested metric is missing (or empty) for this xai/fold.
                    continue

                entry = collected.setdefault(
                    xai_name,
                    {"avg_unc_rank_change": [], "accuracy": [], "ranks": [], "rank_changes": []},
                )
                # .get() turns absent fields into None instead of raising.
                entry["avg_unc_rank_change"].append(metric_block.get("avg_unc_rank_change"))
                entry["accuracy"].append(metric_block.get("accuracy"))

                # Index 0 is stored as the ranks and index 1 as the rank changes
                # -- TODO confirm this layout against the producer.
                all_list = metric_block.get("all_values")
                entry["ranks"].append(_item_or_none(all_list, 0))
                entry["rank_changes"].append(_item_or_none(all_list, 1))
    return out

In [None]:
# Runs to aggregate; the keys become the top-level names of the aggregated dict.
named = dict(dropout_02=dropout_02, dropconnect_01=dropconnect_01)

aggregated = aggregate_cross_val_dicts_rri(named, metric="Relative Rank Improvement")

# Store the aggregate twice: through the project's text helper and as a pickle.
save_dict_to_text(
    aggregated,
    save_dir="/workspaces/expainable-uncertainty-quantification/results_final_mnist/RelativeRankImprovement",
    file_name="relative_rank_improvement_aggregated",
)
out_path = "/workspaces/expainable-uncertainty-quantification/results_final_mnist/RelativeRankImprovement/relative_rank_improvement_aggregated.pkl"
with open(out_path, "wb") as f:
    pickle.dump(aggregated, f, protocol=pickle.HIGHEST_PROTOCOL)

Uncertainty Conveyance Similarity

In [59]:
# Cross-validation results of the dropout run (drop_prob 0.2 according to the path), Uncertainty Conveyance Similarity section.
# NOTE(review): this path points into the `Complexity` results directory, whereas the dropconnect
# load in this section reads from `UncertaintyConveyanceSimilarity`. If this file holds no
# "Uncertainty Conveyance Similarity" block, the aggregate for "dropout_02" ends up empty --
# confirm the path is intended and not a copy-paste leftover.
# NOTE: re-binds `dropout_02` from the previous section; run this cell right before
# the Uncertainty Conveyance Similarity aggregation cell.
dropout_02 = load_results("/workspaces/expainable-uncertainty-quantification/results_final_mnist/Complexity/dropout/2026-01-14/08-10-52/evaluation/cross_validation/100_samples/dropout/layers_None/drop_prob0.2/cross_val_metrics_mnist.pkl")  

In [60]:
# Cross-validation results of the dropconnect run (layers_1, drop_prob 0.1 according to the path), Uncertainty Conveyance Similarity section.
# NOTE: re-binds `dropconnect_01` from the previous section; run this cell right before
# the Uncertainty Conveyance Similarity aggregation cell.
dropconnect_01 = load_results("/workspaces/expainable-uncertainty-quantification/results_final_mnist/UncertaintyConveyanceSimilarity/dropconnect/2026-01-04/17-06-16/evaluation/cross_validation/100_samples/dropconnect/layers_1/drop_prob0.1/cross_val_metrics_mnist.pkl")

In [None]:
def aggregate_cross_val_dicts_ucs(named_dicts, metric="Uncertainty Conveyance Similarity"):
    """
    Aggregate the per-fold Uncertainty Conveyance Similarity results across several runs.

    Args:
      named_dicts (dict): Mapping name -> cross-validation dict.
          Example key: "dropconnect_005" or "dropout_01".
          Value: dict with keys "fold_1", "fold_2", ..., each mapping
          xai_method -> {metric_name: metric_block}. The metric block is read for
          "cosine_mean", "cosine_std", "spearmanr_mean", "spearmanr_std" and "all_values".
      metric (str): Name of the metric block to collect,
          default "Uncertainty Conveyance Similarity".

    Returns:
      dict: Aggregated dict with structure
        {uq_name: {xai_method: {"mean_cosine": [...], "std_cosine": [...],
                                "mean_spearman": [...], "std_spearman": [...],
                                "all_values_cosine": [...], "all_values_spearman": [...]}}}
        Note that the output keys ("mean_cosine", ...) differ from the input keys
        ("cosine_mean", ...). Every list holds one entry per collected fold, in
        ascending fold-number order. Folds or methods whose value is not a dict, and
        methods for which the metric is missing or empty, are skipped. Fields missing
        inside a metric block (including a missing or too short "all_values") are
        stored as None.
    """
    def _fold_order(key):
        # Keys containing a number sort by that number and come first; keys without
        # one follow alphabetically. Returning tuples keeps both groups comparable,
        # so a stray non-fold key cannot make sorted() raise TypeError.
        found = re.search(r"\d+", str(key))
        if found:
            return (0, int(found.group()), "")
        return (1, 0, str(key))

    def _item_or_none(seq, index):
        # "all_values" may be absent or shorter than expected; yield None in that
        # case so it behaves like the scalar fields read with .get().
        if seq is None or len(seq) <= index:
            return None
        return seq[index]

    out = {}
    for name, cv_dict in named_dicts.items():
        collected = out[name] = {}
        for fold in sorted(cv_dict.keys(), key=_fold_order):
            methods = cv_dict[fold]
            if not isinstance(methods, dict):
                continue
            for xai_name, metrics_map in methods.items():
                if not isinstance(metrics_map, dict):
                    continue
                metric_block = metrics_map.get(metric)
                if not metric_block:
                    # The requested metric is missing (or empty) for this xai/fold.
                    continue

                entry = collected.setdefault(
                    xai_name,
                    {"mean_cosine": [], "std_cosine": [], "mean_spearman": [], "std_spearman": [],
                     "all_values_cosine": [], "all_values_spearman": []},
                )
                # .get() turns absent fields into None instead of raising.
                entry["mean_cosine"].append(metric_block.get("cosine_mean"))
                entry["std_cosine"].append(metric_block.get("cosine_std"))
                entry["mean_spearman"].append(metric_block.get("spearmanr_mean"))
                entry["std_spearman"].append(metric_block.get("spearmanr_std"))

                # Index 1 is taken as the cosine values and index 0 as the Spearman
                # values -- TODO confirm this order against the code that writes "all_values".
                all_vals = metric_block.get("all_values")
                entry["all_values_cosine"].append(_item_or_none(all_vals, 1))
                entry["all_values_spearman"].append(_item_or_none(all_vals, 0))

    return out

In [None]:
# Runs to aggregate; the keys become the top-level names of the aggregated dict.
named = dict(dropout_02=dropout_02, dropconnect_01=dropconnect_01)

aggregated = aggregate_cross_val_dicts_ucs(named, metric="Uncertainty Conveyance Similarity")

# Store the aggregate twice: through the project's text helper and as a pickle.
save_dict_to_text(
    aggregated,
    save_dir="/workspaces/expainable-uncertainty-quantification/results_final_mnist/UncertaintyConveyanceSimilarity",
    file_name="uncertainty_conveyance_similarity_aggregated",
)
out_path = "/workspaces/expainable-uncertainty-quantification/results_final_mnist/UncertaintyConveyanceSimilarity/uncertainty_conveyance_similarity_aggregated.pkl"
with open(out_path, "wb") as f:
    pickle.dump(aggregated, f, protocol=pickle.HIGHEST_PROTOCOL)