## Notebook for Preparation of Evaluation Data for Generation of Figures

Import packages

In [9]:
import json
import os
import pickle

import numpy as np
import pandas as pd

# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook
pd.options.mode.chained_assignment = None

Initialize variables

In [10]:
# Evaluation measures (column names of the score data)
sr_eval = "crps me lgt cov".split()

# Measures for which skill scores are considered
sr_skill = sr_eval[:1]

# Aggregation methods: short key -> display name (insertion order matters)
agg_names = dict(
    [
        ("lp", "Linear Pool"),
        ("vi", "Vincentization"),
        ("vi-a", "Vincentization (a)"),
        ("vi-w", "Vincentization (w)"),
        ("vi-aw", "Vincentization (a, w)"),
    ]
)

# Short keys of the aggregation methods, in the order defined above
agg_meths = [*agg_names]

# Data sets: internal key -> display name
# (note that the key "scen_4" is displayed as "Scenario 2")
set_names = dict(
    [
        ("gusts", "Wind"),
        ("scen_1", "Scenario 1"),
        ("scen_4", "Scenario 2"),
        ("protein", "Protein"),
        ("naval", "Naval"),
        ("power", "Power"),
        ("kin8nm", "Kin8nm"),
        ("wine", "Wine"),
        ("concrete", "Concrete"),
        ("energy", "Energy"),
        ("boston", "Boston"),
        ("yacht", "Yacht"),
    ]
)

Get configuration (requires setting `git_path` to the root of the repository first!)

In [11]:
# Set GIT-path
git_path = "path_to_repo"

# Configuration function
def _get_config_info():
    ### Get Config ###
    with open(f"{git_path}/src/config_eval.json", "rb") as f:
        CONFIG = json.load(f)
    
    # Ensemble method
    ens_method = CONFIG["ENS_METHOD"]

    # Get available ensemble methods
    ens_method_ls = CONFIG["_available_ENS_METHOD"]
    
    # Path for figures
    plot_path = os.path.join(CONFIG["PATHS"]["PLOTS_DIR"], ens_method)

    # Path of data
    data_path = os.path.join(
        CONFIG["PATHS"]["DATA_DIR"],
        CONFIG["PATHS"]["RESULTS_DIR"],
        "dataset",
        ens_method,
    )

    # Path of network ensemble data
    data_ens_path = os.path.join(
        CONFIG["PATHS"]["DATA_DIR"],
        CONFIG["PATHS"]["RESULTS_DIR"],
        "dataset",
        ens_method,
        CONFIG["PATHS"]["ENSEMBLE_F"],
    )

    # Path of aggregated network data
    data_agg_path = os.path.join(
        CONFIG["PATHS"]["DATA_DIR"],
        CONFIG["PATHS"]["RESULTS_DIR"],
        "dataset",
        ens_method,
        CONFIG["PATHS"]["AGG_F"],
    )

    # Models considered
    dataset_ls = CONFIG["DATASET"]

    # Number of simulations
    n_sim = CONFIG["PARAMS"]["N_SIM"]

    # Ensemble size
    n_ens = CONFIG["PARAMS"]["N_ENS"]
    
    # Vector of ensemble members
    step_size = 2
    n_ens_vec = np.arange(
        start=step_size, stop=n_ens + step_size, step=step_size
    )

    # Network variants
    nn_vec = CONFIG["PARAMS"]["NN_VEC"]

    return (
        ens_method,
        ens_method_ls,
        plot_path,
        data_path,
        data_ens_path,
        data_agg_path,
        dataset_ls,
        n_sim,
        n_ens,
        n_ens_vec,
        nn_vec,
    )

In [12]:
# Unpack the configuration into notebook-level variables
# (order must match the tuple returned by _get_config_info)
(
    ens_method, ens_method_ls,
    plot_path, data_path, data_ens_path, data_agg_path,
    dataset_ls,
    n_sim, n_ens, n_ens_vec,
    nn_vec,
) = _get_config_info()

Create data for figures

In [13]:
# Rows of the final data frame, one dict per combination of
# (data set, ensembling strategy, network variant, aggregation method,
# ensemble size, simulation run). The rows are collected in a list and
# converted once after the loops: growing a data frame with pd.concat in the
# innermost loop copies all existing rows on every iteration.
plot_records = []

# data_path has the layout <...>/dataset/<ens_method> (see _get_config_info),
# so two levels up is the directory that holds one folder per data set.
# Building each path from this root avoids str.replace on the whole path,
# which would also rewrite any other component that happens to contain
# "dataset" or the name of an ensembling strategy, and it leaves the
# configured data_path untouched.
results_root = os.path.dirname(os.path.dirname(os.path.normpath(data_path)))

# For-Loop over data sets
for dataset in dataset_ls:
    # Console output
    print(dataset)

    # For-Loop over ensembling strategies
    for ens_method in ens_method_ls:
        # Directory with the results of this data set and ensembling strategy
        temp_data_path = os.path.join(results_root, dataset, ens_method)

        # NOTE(security): pickle.load can execute arbitrary code; only load
        # result files that were produced by a trusted pipeline.

        # Load scores
        filename = f"eval_{dataset}_{ens_method}.pkl"
        with open(os.path.join(temp_data_path, filename), "rb") as f:
            df_scores = pickle.load(f)

        # Load diversity measures
        filename = f"diversity_{dataset}_{ens_method}.pkl"
        with open(os.path.join(temp_data_path, filename), "rb") as f:
            df_diversity = pickle.load(f)

        # Get number of simulations (n_sim is zero-based in the score data)
        n_sim_set = df_scores["n_sim"].max() + 1

        # For-Loop over network variants
        for temp_nn in nn_vec:
            # Scores of this data set and network only
            df_sc = df_scores[
                (df_scores["model"] == dataset) & (df_scores["nn"] == temp_nn)
            ]

            # Diversity measures of this data set and network only
            df_div_nn = df_diversity[
                (df_diversity["model"] == dataset)
                & (df_diversity["nn"] == temp_nn)
            ]

            # Dataframe for rank calculation: everything except the
            # individual ("ind") forecasts. An explicit copy is taken so the
            # new column is not written into a slice of df_sc.
            df_rank = df_sc[df_sc["type"] != "ind"].copy()

            # Rank of each method w.r.t. the CRPS within every
            # (simulation run, ensemble size) group; ties get the lowest rank
            df_rank["rank"] = df_rank.groupby(["n_sim", "n_ens"])["crps"].rank(
                method="min"
            )

            # Scores of the individual ensemble members
            df_ens0 = df_sc[df_sc["type"] == "ind"]

            # For-Loop over aggregation methods
            for temp_agg in agg_meths:
                # Scores and ranks of this aggregation method
                df_agg0 = df_sc[df_sc["type"] == temp_agg]
                df_rank0 = df_rank[df_rank["type"] == temp_agg]

                # For-Loop over number of ensemble members
                for i_ens in n_ens_vec:
                    # The reference uses the first i_ens members (n_rep < i_ens)
                    df_ens = df_ens0[df_ens0["n_rep"] < i_ens]
                    df_agg = df_agg0[df_agg0["n_ens"] == i_ens]
                    df_rank_sub = df_rank0[df_rank0["n_ens"] == i_ens]
                    df_div_ens = df_div_nn[df_div_nn["n_ens"] == i_ens]

                    # For-Loop over simulation runs
                    for i_sim in range(n_sim_set):
                        # Get subsets
                        df_ens_sim = df_ens[df_ens["n_sim"] == i_sim]
                        df_agg_sim = df_agg[df_agg["n_sim"] == i_sim]
                        df_rank_sim = df_rank_sub[df_rank_sub["n_sim"] == i_sim]
                        df_div_sub = df_div_ens[df_div_ens["n_sim"] == i_sim]

                        # New row: aggregated score, reference (mean over the
                        # individual members) and diversity measures
                        plot_records.append(
                            {
                                "dataset": set_names[dataset],
                                "i_set": dataset_ls.index(dataset),
                                "ens_method": ens_method,
                                "i_method": ens_method_ls.index(ens_method),
                                "nn": temp_nn.upper(),
                                "agg": temp_agg,
                                "n_ens": i_ens,
                                "n_sim": i_sim,
                                "crps": df_agg_sim["crps"].iloc[0],
                                "crps_ref": np.mean(df_ens_sim["crps"]),
                                "crps_rank": df_rank_sim["rank"].iloc[0],
                                "lgt": df_agg_sim["lgt"].iloc[0],
                                "lgt_ref": np.mean(df_ens_sim["lgt"]),
                                "me": df_agg_sim["me"].iloc[0],
                                "me_ref": np.mean(df_ens_sim["me"]),
                                "cov": df_agg_sim["cov"].iloc[0],
                                "cov_ref": np.mean(df_ens_sim["cov"]),
                                "var_pit": df_agg_sim["var_pit"].iloc[0],
                                "var_pit_ref": np.mean(df_ens_sim["var_pit"]),
                                "div_crps": df_div_sub["crps"].iloc[0],
                                "div_lgt": df_div_sub["lgt"].iloc[0],
                                "div_loc": df_div_sub["loc"].iloc[0],
                                "div_crps_z": df_div_sub["z_crps"].iloc[0],
                                "div_lgt_z": df_div_sub["z_lgt"].iloc[0],
                                "div_loc_z": df_div_sub["z_loc"].iloc[0],
                            }
                        )

# Build the data frame for the figures in one step (rows in loop order)
df_plot = pd.DataFrame(plot_records)

gusts
protein
naval
power
kin8nm
scen_1
scen_4
wine
concrete
energy
boston
yacht


In [14]:
# Derived measures, added as new columns of df_plot

# CRPS skill score (in percent) relative to the reference
df_plot["crpss"] = (1 - df_plot["crps"].div(df_plot["crps_ref"])) * 100

# Absolute and relative difference of lgt to the reference
df_plot["lgt_dif"] = df_plot["lgt"].sub(df_plot["lgt_ref"])
df_plot["lgt_reldif"] = df_plot["lgt_dif"].div(df_plot["lgt_ref"])

# Absolute value of me, and its absolute / relative difference to the reference
df_plot["ame"] = df_plot["me"].abs()
df_plot["ame_ref"] = df_plot["me_ref"].abs()
df_plot["ame_dif"] = df_plot["ame"].sub(df_plot["ame_ref"])
df_plot["ame_reldif"] = df_plot["ame_dif"].div(df_plot["ame_ref"])

# Deviation of var_pit from 1/12 (the variance of a standard uniform distribution)
df_plot["disp"] = df_plot["var_pit"].sub(1/12)
df_plot["disp_ref"] = df_plot["var_pit_ref"].sub(1/12)

Save data

In [15]:
# Write the assembled data frame to the repository's src folder as a pickle
out_file = f"{git_path}/src/paper_figures_data.pkl"
with open(out_file, "wb") as out_handle:
    pickle.dump(df_plot, out_handle)