In [None]:
import os
import sys
import pickle
import time
import numpy as np
import pandas as pd
from sklearn.metrics import brier_score_loss, mean_absolute_error

# Silence TensorFlow's C++ log output. The variable must already be in the
# environment when TensorFlow is first imported, so it is set ahead of bayesflow
# and the project modules (which presumably pull in TensorFlow as well).
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

# Set paths
#os.chdir(os.path.dirname(__file__))
# Both entries are resolved against the current working directory, so the
# notebook has to be started from its own folder for the imports below to work.
sys.path.extend([
    os.path.abspath("../.."),
    os.path.abspath("../../../BayesFlow_dev/BayesFlow/")
])

# Import from relative paths
from src.python.helpers import MaskingConfigurator, get_latex_results_table
from src.python.training import load_training_data
from src.python.ensemble import get_ensemble_predictions
import bayesflow as bf

In [2]:
# Setup
# Size of the per-network axis in the metrics arrays and result tables.
N_ENSEMBLE_MEMBERS = 20
# Passed to get_ensemble_predictions as num_models; also the number of
# per-model columns the metrics loop iterates over.
N_MODELS = 4
# Passed to get_ensemble_predictions as summary_dim.
SUMMARY_DIM = 64
# Passed to get_ensemble_predictions as num_chunks (chunked prediction).
N_CHUNKS = 200
# One prediction run per entry: True -> "sim_powerscaled", False -> "sim_unscaled".
POWER_SCALINGS = [True, False]

In [3]:
# Get predictions
PREDICTIONS_DIR = "../../data/levy_comparison/predictions"


def _load_or_compute_predictions(power_scaling):
    """Return the ensemble prediction dict for one scaling variant.

    The pickled predictions are read from PREDICTIONS_DIR when present. If the
    file is missing or unreadable, the predictions are recomputed with
    get_ensemble_predictions, written to disk and then read back.

    Parameters
    ----------
    power_scaling : bool
        True selects the "sim_powerscaled" data/checkpoints, False "sim_unscaled".

    Returns
    -------
    dict
        Keys "preds", "logits", "summary_output" and "sim_indices".
    """
    scaling_name = "sim_powerscaled" if power_scaling else "sim_unscaled"
    pred_path = f"{PREDICTIONS_DIR}/preds_{scaling_name}.pkl"

    # Only a missing or truncated/corrupt cache file counts as a cache miss.
    # Anything else (KeyboardInterrupt, genuine bugs) must propagate instead of
    # silently triggering an expensive recomputation.
    # NOTE: pickle.load executes arbitrary code; only use self-generated files.
    try:
        with open(pred_path, "rb") as file:
            return pickle.load(file)
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        pass

    os.makedirs(PREDICTIONS_DIR, exist_ok=True)

    # Takes ~30 seconds for initial data configuration and ~20 seconds per network on 8000 test data sets
    # Load and configure data
    test_data = load_training_data(scaling_name, "test")
    masking_configurator = MaskingConfigurator(power_scaling=power_scaling)
    test_data = masking_configurator(test_data)

    embeddings, pmps, logits = get_ensemble_predictions(
        path=f"ensemble_checkpoints/{scaling_name}",
        data=test_data,
        num_models=N_MODELS,
        summary_dim=SUMMARY_DIM,
        predict_in_chunks=True,
        num_chunks=N_CHUNKS
    )

    pred_dict = {
        "preds": pmps,
        "logits": logits,
        "summary_output": embeddings,
        "sim_indices": test_data["model_indices"]
    }

    with open(pred_path, "wb") as file:
        pickle.dump(pred_dict, file)

    # Read the file back so cached and freshly computed runs hand the same kind
    # of objects to the cells below.
    with open(pred_path, "rb") as file:
        return pickle.load(file)


scaled_sim_dict = _load_or_compute_predictions(power_scaling=True)
unscaled_sim_dict = _load_or_compute_predictions(power_scaling=False)

In [None]:
# # Measure inference times
# inference_times_per_net = {}

# for power_scaling in POWER_SCALINGS:
#     scaling_name = "sim_powerscaled" if power_scaling else "sim_unscaled"

#     # Load and configure data
#     test_data = load_training_data(scaling_name, "test")
#     masking_configurator = MaskingConfigurator(power_scaling=power_scaling)
#     test_data = masking_configurator(test_data)

#     start_time = time.time()
#     embeddings, pmps, logits = get_ensemble_predictions(
#         path=f"ensemble_checkpoints/{scaling_name}", 
#         data=test_data, 
#         num_models=N_MODELS,
#         summary_dim=SUMMARY_DIM, 
#         predict_in_chunks=True,
#         num_chunks=N_CHUNKS
#     )
#     end_time = time.time()

#     pred_dict = {
#         "preds": pmps,
#         "logits": logits,
#         "summary_output": embeddings,
#         "sim_indices": test_data["model_indices"]
#     }
#     inference_times_per_net[str(power_scaling)] = (end_time - start_time) / N_ENSEMBLE_MEMBERS

# # Print times
# print("Seconds powerscaled inference per net:", inference_times_per_net["True"])
# print("Mins powerscaled inference per net:", inference_times_per_net["True"] / 60)
# print("Mins powerscaled 1000 priors training+inference:", 66 + (inference_times_per_net["True"] / 60) * 1000)
# print("Mins unscaled 1000 priors training+inference:", (66 + inference_times_per_net["False"] / 60) * 1000)
# print("Hours powerscaled 1000 priors training+inference:", (66 + (inference_times_per_net["True"] / 60) * 1000) / 60)
# print("Hours unscaled 1000 priors training+inference:", ((66 + inference_times_per_net["False"] / 60) * 1000) / 60)

In [5]:
sim_dicts = [scaled_sim_dict, unscaled_sim_dict]
metrics_dict = {}
metrics_names = ["ECE", "Brier Score", "MAE", "Accuracy"]
scaling_names = ["sim_powerscaled", "sim_unscaled"]

# For every ensemble member and every model column, compute the metrics listed in
# metrics_names: Expected Calibration Error, Brier score, MAE and accuracy.
# Result per scaling variant: array of shape (N_ENSEMBLE_MEMBERS, N_MODELS, 4).
for scaling_name, sim_dict in zip(scaling_names, sim_dicts):
    metrics = np.zeros((N_ENSEMBLE_MEMBERS, N_MODELS, len(metrics_names)))
    # Indexed column-wise below; presumably one-hot true model indices - TODO confirm.
    m_true = sim_dict["sim_indices"]

    for network in range(N_ENSEMBLE_MEMBERS):
        # Predictions of a single network (first axis of "preds" indexes networks).
        m_pred = sim_dict["preds"][network, ...]

        # Only the ECE values are needed; the two other return values are discarded.
        eces, _, _ = bf.computational_utilities.expected_calibration_error(m_true=m_true, m_pred=m_pred)
        # Arguments are passed positionally because the keyword of the second
        # argument was renamed (y_prob -> y_proba) in newer scikit-learn releases.
        brier_scores = [brier_score_loss(m_true[:, m], m_pred[:, m]) for m in range(N_MODELS)]
        maes = mean_absolute_error(y_true=m_true, y_pred=m_pred, multioutput="raw_values")
        # Per-column accuracy of the thresholded (> 0.5) prediction.
        accuracies = [np.mean(m_true[:, m] == (m_pred[:, m] > .5)) for m in range(N_MODELS)]

        metrics[network, :, 0] = eces
        metrics[network, :, 1] = brier_scores
        metrics[network, :, 2] = maes
        metrics[network, :, 3] = accuracies

    metrics_dict[scaling_name] = metrics

In [6]:
# Create tables
# Average each metric over the model axis (axis 1), leaving one row per network.
scaled_model_means = metrics_dict["sim_powerscaled"].mean(axis=1)
scaled_network_labels = [f'Network {k}' for k in range(1, N_ENSEMBLE_MEMBERS + 1)]
scaled_validation = pd.DataFrame(
    scaled_model_means, index=scaled_network_labels, columns=metrics_names
)
scaled_validation

Unnamed: 0,ECE,Brier Score,MAE,Accuracy
Network 1,0.008775,0.011805,0.016615,0.985281
Network 2,0.012118,0.028105,0.047555,0.962094
Network 3,0.007228,0.009029,0.012031,0.989062
Network 4,0.005228,0.005826,0.007289,0.993281
Network 5,0.008009,0.010365,0.014341,0.987313
Network 6,0.008039,0.010362,0.013981,0.987688
Network 7,0.008318,0.010263,0.013934,0.987719
Network 8,0.009441,0.012078,0.016548,0.98525
Network 9,0.007846,0.009115,0.011449,0.989625
Network 10,0.01154,0.021141,0.034136,0.971719


In [7]:
# Average each metric over the model axis (axis 1), leaving one row per network.
unscaled_model_means = metrics_dict["sim_unscaled"].mean(axis=1)
unscaled_network_labels = [f'Network {k}' for k in range(1, N_ENSEMBLE_MEMBERS + 1)]
unscaled_validation = pd.DataFrame(
    unscaled_model_means, index=unscaled_network_labels, columns=metrics_names
)
unscaled_validation

Unnamed: 0,ECE,Brier Score,MAE,Accuracy
Network 1,0.003139,0.003237,0.003798,0.996437
Network 2,0.004203,0.005006,0.006647,0.994188
Network 3,0.008699,0.012392,0.018302,0.984313
Network 4,0.00753,0.017558,0.030037,0.976719
Network 5,0.005656,0.014477,0.025402,0.980812
Network 6,0.001933,0.001928,0.002288,0.997938
Network 7,0.002714,0.002878,0.00357,0.996625
Network 8,0.00896,0.01364,0.0205,0.982125
Network 9,0.002456,0.002379,0.002887,0.997313
Network 10,0.002239,0.002549,0.003206,0.997


In [8]:
# Save validation results to combine with application results
validation_results_dir = "../../data/levy_comparison/validation_results"
# Create the target folder if needed; open(..., "wb") does not create directories.
os.makedirs(validation_results_dir, exist_ok=True)

# Build the payload before opening the file so that a failure here cannot leave
# an empty pickle behind.
val_results_dict = {
    "scaled": scaled_validation,
    "unscaled": unscaled_validation
}

with open(f"{validation_results_dir}/validation_scaled_unscaled.pkl", "wb") as file:
    pickle.dump(val_results_dict, file)