# Run and plot significance study

In [None]:
import sys

sys.path.append("/vol/biomedic3/mb121/calibration_exploration/")

from classification.load_model_and_config import (
    get_run_id_from_config,
    _clean_config_for_backward_compatibility,
)
from hydra import initialize, compose
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scikit_posthocs as sp
from collections import defaultdict
import pingouin

# Apply seaborn's "whitegrid" style to the figures created in this notebook.
sns.set_style("whitegrid")

# Names of the experiment configs to evaluate, one per dataset. Every entry is
# the dataset name prefixed with "base_".
all_experiments = [
    f"base_{dataset}"
    for dataset in (
        "density",
        "camelyon",
        "retina",
        "living17",
        "entity30",
        "domainnet",
        "icam",
        "chexpert",
    )
]

Look up the run IDs of all trained models (one lookup per experiment, architecture and training loss)

In [None]:
pretrained = False
model_names = [
    "resnet18",
    "resnet50",
    "mobilenetv2_100",
    "convnext_tiny",
    "vit_base_patch16_224",
    "efficientnet_b0",
]

# One mapping per training loss: dataset name -> list of run ids that were found.
all_run_ids = defaultdict(list)
all_run_ids_ls = defaultdict(list)
all_run_ids_er = defaultdict(list)
all_run_ids_er_ls = defaultdict(list)
all_run_focal = defaultdict(list)

# (destination mapping, loss-specific Hydra overrides), listed in the order in
# which the lookups are issued for every experiment/model pair.
_loss_variants = [
    (all_run_ids, ["trainer.label_smoothing=0.00"]),
    (all_run_ids_ls, ["trainer.label_smoothing=0.05"]),
    (all_run_ids_er, ["trainer.entropy_regularisation=0.1"]),
    (
        all_run_ids_er_ls,
        ["trainer.entropy_regularisation=0.1", "trainer.label_smoothing=0.05"],
    ),
    # NOTE(review): gamma=-53 looks like a sentinel rather than a literal
    # exponent - confirm against the trainer's focal-loss implementation.
    (all_run_focal, ["trainer.use_focal_loss=True", "trainer.focal_loss_gamma=-53"]),
]


def _find_run_id(base_overrides, loss_overrides):
    """Compose the config for one experiment/model/loss and look up its run id.

    Must be called inside an active ``hydra.initialize`` context.

    Args:
        base_overrides: Hydra overrides selecting the experiment and the model.
        loss_overrides: Hydra overrides selecting the training loss.

    Returns:
        Whatever ``get_run_id_from_config`` returns for the composed config;
        ``None`` is treated by the caller as "no run found".
    """
    config = compose(
        config_name="config.yaml", overrides=base_overrides + loss_overrides
    )
    # The learning rate is removed before the lookup - presumably so that runs
    # are matched regardless of their lr; verify against get_run_id_from_config.
    delattr(config.trainer, "lr")
    _clean_config_for_backward_compatibility(config)
    return get_run_id_from_config(
        config, allow_multiple_runs=False, allow_return_none_if_no_runs=True
    )


for experiment in all_experiments:
    # Strip the 5-character "base_" prefix to obtain the dataset key.
    dataset = experiment[5:]
    configs_to_evaluate = [
        [
            f"experiment={experiment}",
            f"model.encoder_name={model}",
            f"model.pretrained={pretrained}",
        ]
        for model in model_names
    ]
    with initialize(version_base=None, config_path="../configs"):
        for config_str in configs_to_evaluate:
            for run_ids_by_dataset, loss_overrides in _loss_variants:
                run_id = _find_run_id(config_str, loss_overrides)
                # Missing runs are skipped, so the lists may differ in length.
                if run_id is not None:
                    run_ids_by_dataset[dataset].append(run_id)

In [None]:
def retrieve_metrics_df(
    list_run_ids,
    metric,
    experiment,
    outputs_dir="/vol/biomedic3/mb121/calibration_exploration/outputs",
):
    """Load one metric for several runs and average it over ID and OOD domains.

    For every run id, ``<outputs_dir>/run_<run_id>/metrics_<metric>.csv`` is
    read. Its first (unnamed) column holds the evaluation domain: rows labelled
    ``"id"`` form the ``"ID"`` group and every other row is pooled into
    ``"OOD"``. All remaining columns are averaged within each group.

    Args:
        list_run_ids: Iterable of run ids to load.
        metric: Metric name used in the CSV file name (e.g. ``"ECE"``).
        experiment: Value stored in the ``dataset`` column of the result.
        outputs_dir: Directory containing the ``run_<run_id>`` folders.

    Returns:
        A DataFrame with one row per (run, domain group) and the columns
        ``domain``, the CSV's value columns, and ``dataset``. Runs whose CSV
        file is missing are reported on stdout and skipped; an empty DataFrame
        is returned when no file could be read.
    """
    all_df = []
    for run_id in list_run_ids:
        metrics_path = Path(outputs_dir) / f"run_{run_id}" / f"metrics_{metric}.csv"
        try:
            df = pd.read_csv(metrics_path)
        except FileNotFoundError:
            print(str(metrics_path) + " Not found")
            continue
        df = df.rename(columns={"Unnamed: 0": "domain"})
        # Only "id" versus everything else matters here, so no per-domain
        # parsing (e.g. of corruption severity suffixes) is needed.
        df["domain"] = df["domain"].map(lambda x: "ID" if "id" == x else "OOD")
        df = df.groupby("domain").mean().reset_index()
        df["dataset"] = experiment
        all_df.append(df)
    if len(all_df) == 0:
        return pd.DataFrame()
    return pd.concat(all_df)


# Loss identifiers, paired position-by-position with the run-id mappings below.
keys = ["CE", "LS", "ER", "ERLS", "Focal"]
metrics = ["ECE", "Brier"]
run_lists = [
    all_run_ids,
    all_run_ids_ls,
    all_run_ids_er,
    all_run_ids_er_ls,
    all_run_focal,
]

# all_dfs[loss][metric] is a single DataFrame stacking the per-run ID/OOD
# averages of every experiment.
all_dfs = {}
for loss_key, runs_by_dataset in zip(keys, run_lists):
    frames_by_metric = defaultdict(list)
    for metric_name in metrics:
        frames_by_metric[metric_name] = pd.concat(
            [
                retrieve_metrics_df(
                    runs_by_dataset[name[5:]], metric_name, name[5:]
                )
                for name in all_experiments
            ]
        )
    all_dfs[loss_key] = frames_by_metric

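Run the significance analysis (Friedman test and Nemenyi post-hoc test) and plot the matrix of significant pairwise differences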

In [None]:
# Training losses compared inside every calibrator group of the heatmap.
loss_keys = ["CE", "ERLS", "Focal", "LS", "ER"]
# Metric columns to compare: uncalibrated probabilities plus calibrated variants.
c2 = [
    "probas",
    "calib_ts",
    "calib_irm",
    "calib_irovats",
    "calib_ts_with_ood",
    "calib_irm_with_ood",
    "calib_ebs",
]
# Display names in calibrator-major order, e.g. "TS_CE", "TS_ERLS", ...
cols = [f"{p.replace('calib_', '').upper()}_{k}" for p in c2 for k in loss_keys]

for d in ["OOD", "ID"]:
    for m in ["ECE"]:
        dfs_to_plot = []
        for k in loss_keys:
            clean_name = {c: c.replace("calib_", "").upper() + f"_{k}" for c in c2}
            dfs_to_plot.append(
                all_dfs[k][m][["domain"] + c2].rename(columns=clean_name)
            )
        wide = pd.concat(dfs_to_plot, axis=1)

        # Each loss contributes its own "domain" column. The tests below are
        # paired, so every row must describe the same domain for all losses.
        domain_columns = wide.loc[:, wide.columns == "domain"]
        if not domain_columns.nunique(axis=1, dropna=False).eq(1).all():
            raise ValueError(
                f"Rows of the {m} tables are not aligned across losses {loss_keys}"
            )
        in_domain = (domain_columns.iloc[:, 0] == d).to_numpy()

        # One row per run, one column per (calibrator, loss) pair. The index is
        # reset because after filtering every row carries the same label, and
        # scikit-posthocs appears to derive block ids from the index (verify
        # against the installed version).
        measurements = (
            wide.loc[in_domain, cols].astype(float).reset_index(drop=True)
        )

        # Omnibus Friedman test on the measurements themselves, not on the
        # matrix of post-hoc p-values.
        print(pingouin.friedman(measurements))

        # Conduct the Nemenyi post-hoc test
        pvalues = sp.posthoc_nemenyi_friedman(measurements).loc[cols, cols]

        f, ax = plt.subplots(1, 1, figsize=(8, 8))
        sns.heatmap(
            (pvalues < 0.05),
            cbar=False,
            ax=ax,
            linecolor="grey",
            linewidth=0.00,
            cmap=["white", "navy"],
        )
        ax.set_title(f"{m} ({d}): Nemenyi post-hoc test, p < 0.05")

        # Separate the calibrator groups (one block of len(loss_keys) cells each).
        for i in range(len(c2)):
            ax.axvline(x=i * len(loss_keys), color="black", linewidth=1.2)
            ax.axhline(y=i * len(loss_keys), color="black", linewidth=1.2)