diff --git a/bayesflow/diagnostics/metrics/__init__.py b/bayesflow/diagnostics/metrics/__init__.py
index 453793339..1fb5f2b8d 100644
--- a/bayesflow/diagnostics/metrics/__init__.py
+++ b/bayesflow/diagnostics/metrics/__init__.py
@@ -1,3 +1,4 @@
 from .calibration_error import calibration_error
 from .posterior_contraction import posterior_contraction
 from .root_mean_squared_error import root_mean_squared_error
+from .expected_calibration_error import expected_calibration_error
diff --git a/bayesflow/diagnostics/metrics/expected_calibration_error.py b/bayesflow/diagnostics/metrics/expected_calibration_error.py
new file mode 100644
index 000000000..d811e6f2e
--- /dev/null
+++ b/bayesflow/diagnostics/metrics/expected_calibration_error.py
@@ -0,0 +1,98 @@
+import numpy as np
+from keras import ops
+from typing import Sequence, Any, Mapping
+
+from ...utils.exceptions import ShapeError
+from sklearn.calibration import calibration_curve
+
+
+def expected_calibration_error(
+    estimates: np.ndarray,
+    targets: np.ndarray,
+    model_names: Sequence[str] = None,
+    n_bins: int = 10,
+    return_probs: bool = False,
+) -> Mapping[str, Any]:
+    """Estimates the expected calibration error (ECE) of a model comparison network according to [1].
+
+    [1] Naeini, M. P., Cooper, G., & Hauskrecht, M. (2015).
+    Obtaining well calibrated probabilities using Bayesian binning.
+    In Proceedings of the AAAI Conference on Artificial Intelligence (Vol. 29, No. 1).
+
+    Notes
+    -----
+    Make sure that ``targets`` are **one-hot encoded** classes!
+
+    Parameters
+    ----------
+    estimates : array of shape (num_sim, num_models)
+        The predicted posterior model probabilities.
+    targets : array of shape (num_sim, num_models)
+        The one-hot-encoded true model indices.
+    model_names : Sequence[str], optional (default = None)
+        Optional model names to show in the output. By default, models are called "M_" + model index.
+    n_bins : int, optional, default: 10
+        The number of bins to use for the calibration curves (and marginal histograms).
+        Passed into ``sklearn.calibration.calibration_curve()``.
+    return_probs : bool, optional, default: False
+        Whether to also return the outputs of ``sklearn.calibration.calibration_curve()``.
+
+    Returns
+    -------
+    result : dict
+        Dictionary containing:
+        - "values" : np.ndarray
+            The expected calibration error per model.
+        - "metric_name" : str
+            The name of the metric ("Expected Calibration Error").
+        - "model_names" : Sequence[str]
+            The (inferred) model names.
+ - "probs_true": (optional) list[np.ndarray]: + Outputs of ``sklearn.calibration.calibration_curve()`` per model + - "probs_pred": (optional) list[np.ndarray]: + Outputs of ``sklearn.calibration.calibration_curve()`` per model + """ + + # Convert tensors to numpy, if passed + estimates = ops.convert_to_numpy(estimates) + targets = ops.convert_to_numpy(targets) + + if estimates.shape != targets.shape: + raise ShapeError("`estimates` and `targets` must have the same shape.") + + if model_names is None: + model_names = ["M_" + str(i) for i in range(estimates.shape[-1])] + elif len(model_names) != estimates.shape[-1]: + raise ShapeError("There must be exactly one `model_name` for each model in `estimates`") + + # Extract number of models and prepare containers + ece = [] + probs_true = [] + probs_pred = [] + + targets = targets.argmax(axis=-1) + + # Loop for each model and compute calibration errs per bin + for model_index in range(estimates.shape[-1]): + y_true = (targets == model_index).astype(np.float32) + y_prob = estimates[..., model_index] + prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=n_bins) + + # Compute ECE by weighting bin errors by bin size + bins = np.linspace(0.0, 1.0, n_bins + 1) + binids = np.searchsorted(bins[1:-1], y_prob) + bin_total = np.bincount(binids, minlength=len(bins)) + nonzero = bin_total != 0 + error = np.sum(np.abs(prob_true - prob_pred) * (bin_total[nonzero] / len(y_true))) + + ece.append(error) + probs_true.append(prob_true) + probs_pred.append(prob_pred) + + output = dict(values=np.array(ece), metric_name="Expected Calibration Error", model_names=model_names) + + if return_probs: + output["probs_true"] = probs_true + output["probs_pred"] = probs_pred + + return output diff --git a/bayesflow/diagnostics/plots/mc_calibration.py b/bayesflow/diagnostics/plots/mc_calibration.py index 535518afc..1f1e3678f 100644 --- a/bayesflow/diagnostics/plots/mc_calibration.py +++ b/bayesflow/diagnostics/plots/mc_calibration.py @@ -5,19 +5,20 @@ from bayesflow.utils import ( - expected_calibration_error, prepare_plot_data, add_titles_and_labels, add_metric, prettify_subplots, ) +from bayesflow.diagnostics.metrics import expected_calibration_error + def mc_calibration( pred_models: dict[str, np.ndarray] | np.ndarray, true_models: dict[str, np.ndarray] | np.ndarray, model_names: Sequence[str] = None, - num_bins: int = 10, + n_bins: int = 10, label_fontsize: int = 16, title_fontsize: int = 18, metric_fontsize: int = 14, @@ -40,7 +41,7 @@ def mc_calibration( The one-hot-encoded true model indices per data set. model_names : list or None, optional, default: None The model names for nice plot titles. Inferred if None. - num_bins : int, optional, default: 10 + n_bins : int, optional, default: 10 The number of bins to use for the calibration curves (and marginal histograms). 
     label_fontsize : int, optional, default: 16
         The font size of the y-label and y-label texts
@@ -77,17 +78,21 @@ def mc_calibration(
         default_name="M",
     )
 
-    # Compute calibration
-    cal_errors, true_probs, pred_probs = expected_calibration_error(
-        plot_data["targets"], plot_data["estimates"], num_bins
+    # Compute ECE and calibration probabilities
+    ece = expected_calibration_error(
+        estimates=plot_data["estimates"],
+        targets=plot_data["targets"],
+        model_names=plot_data["variable_names"],
+        n_bins=n_bins,
+        return_probs=True,
     )
 
     for j, ax in enumerate(plot_data["axes"].flat):
         # Plot calibration curve
-        ax.plot(pred_probs[j], true_probs[j], "o-", color=color)
+        ax.plot(ece["probs_pred"][j], ece["probs_true"][j], "o-", color=color)
 
         # Plot PMP distribution over bins
-        uniform_bins = np.linspace(0.0, 1.0, num_bins + 1)
+        uniform_bins = np.linspace(0.0, 1.0, n_bins + 1)
         norm_weights = np.ones_like(plot_data["estimates"]) / len(plot_data["estimates"])
         ax.hist(plot_data["estimates"][:, j], bins=uniform_bins, weights=norm_weights[:, j], color="grey", alpha=0.3)
 
@@ -104,7 +109,7 @@ def mc_calibration(
         add_metric(
             ax,
             metric_text=r"$\widehat{{\mathrm{{ECE}}}}$",
-            metric_value=cal_errors[j],
+            metric_value=ece["values"][j],
             metric_fontsize=metric_fontsize,
         )
 
diff --git a/bayesflow/utils/__init__.py b/bayesflow/utils/__init__.py
index 4b5482089..bacf406d6 100644
--- a/bayesflow/utils/__init__.py
+++ b/bayesflow/utils/__init__.py
@@ -4,7 +4,6 @@
     numpy_utils,
 )
 from .callbacks import detailed_loss_callback
-from .comp_utils import expected_calibration_error
 from .devices import devices
 from .dict_utils import (
     convert_args,
diff --git a/bayesflow/utils/comp_utils.py b/bayesflow/utils/comp_utils.py
deleted file mode 100644
index 4d4b5c550..000000000
--- a/bayesflow/utils/comp_utils.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import numpy as np
-from keras import ops
-
-from sklearn.calibration import calibration_curve
-
-
-def expected_calibration_error(m_true, m_pred, num_bins=10):
-    """Estimates the expected calibration error (ECE) of a model comparison network according to [1].
-
-    [1] Naeini, M. P., Cooper, G., & Hauskrecht, M. (2015).
-    Obtaining well calibrated probabilities using bayesian binning.
-    In Proceedings of the AAAI conference on artificial intelligence (Vol. 29, No. 1).
-
-    Notes
-    -----
-    Make sure that ``m_true`` are **one-hot encoded** classes!
-
-    Parameters
-    ----------
-    m_true : array of shape (num_sim, num_models)
-        The one-hot-encoded true model indices.
-    m_pred : array of shape (num_sim, num_models)
-        The predicted posterior model probabilities.
-    num_bins : int, optional, default: 10
-        The number of bins to use for the calibration curves (and marginal histograms).
-
-    Returns
-    -------
-    cal_errs : list of length (num_models)
-        The ECEs for each model.
-    probs : list of length (num_models)
-        The bin information for constructing the calibration curves.
-        Each list contains two arrays of length (num_bins) with the predicted and true probabilities for each bin.
- """ - - # Convert tensors to numpy, if passed - m_true = ops.convert_to_numpy(m_true) - m_pred = ops.convert_to_numpy(m_pred) - - # Extract number of models and prepare containers - n_models = m_true.shape[1] - cal_errs = [] - probs_true = [] - probs_pred = [] - - # Loop for each model and compute calibration errs per bin - for k in range(n_models): - y_true = (m_true.argmax(axis=1) == k).astype(np.float32) - y_prob = m_pred[:, k] - prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=num_bins) - - # Compute ECE by weighting bin errors by bin size - bins = np.linspace(0.0, 1.0, num_bins + 1) - binids = np.searchsorted(bins[1:-1], y_prob) - bin_total = np.bincount(binids, minlength=len(bins)) - nonzero = bin_total != 0 - cal_err = np.sum(np.abs(prob_true - prob_pred) * (bin_total[nonzero] / len(y_true))) - - cal_errs.append(cal_err) - probs_true.append(prob_true) - probs_pred.append(prob_pred) - return cal_errs, probs_true, probs_pred diff --git a/tests/test_diagnostics/conftest.py b/tests/test_diagnostics/conftest.py index 52e3e343a..9bd9bd907 100644 --- a/tests/test_diagnostics/conftest.py +++ b/tests/test_diagnostics/conftest.py @@ -1,5 +1,6 @@ import numpy as np import pytest +from bayesflow.utils.numpy_utils import softmax @pytest.fixture() @@ -31,3 +32,22 @@ def random_priors(): "sigma": np.random.standard_normal(size=(64, 1)), "y": np.random.standard_normal(size=(64, 3, 1)), } + + +@pytest.fixture() +def model_names(): + return [r"$\mathcal{M}_0$", r"$\mathcal{M}_1$", r"$\mathcal{M}_2$"] + + +@pytest.fixture() +def true_models(): + true_models = np.random.choice(3, 100) + true_models = np.eye(3)[true_models].astype(np.int32) + return true_models + + +@pytest.fixture() +def pred_models(true_models): + pred_models = np.random.normal(loc=true_models) + pred_models = softmax(pred_models, axis=-1) + return pred_models diff --git a/tests/test_diagnostics/test_diagnostics_metrics.py b/tests/test_diagnostics/test_diagnostics_metrics.py index 62b0b4ca1..792cf558a 100644 --- a/tests/test_diagnostics/test_diagnostics_metrics.py +++ b/tests/test_diagnostics/test_diagnostics_metrics.py @@ -1,4 +1,5 @@ import bayesflow as bf +import pytest def num_variables(x: dict): @@ -47,3 +48,26 @@ def test_root_mean_squared_error(random_estimates, random_targets): assert out["values"].shape == (num_variables(random_estimates),) assert out["metric_name"] == "NRMSE" assert out["variable_names"] == ["beta_0", "beta_1", "sigma"] + + +def test_expected_calibration_error(pred_models, true_models, model_names): + out = bf.diagnostics.metrics.expected_calibration_error(pred_models, true_models, model_names=model_names) + assert list(out.keys()) == ["values", "metric_name", "model_names"] + assert out["values"].shape == (pred_models.shape[-1],) + assert out["metric_name"] == "Expected Calibration Error" + assert out["model_names"] == [r"$\mathcal{M}_0$", r"$\mathcal{M}_1$", r"$\mathcal{M}_2$"] + + # returns probs? + out = bf.diagnostics.metrics.expected_calibration_error(pred_models, true_models, return_probs=True) + assert list(out.keys()) == ["values", "metric_name", "model_names", "probs_true", "probs_pred"] + assert len(out["probs_true"]) == pred_models.shape[-1] + assert len(out["probs_pred"]) == pred_models.shape[-1] + # default: auto model names + assert out["model_names"] == ["M_0", "M_1", "M_2"] + + # handles incorrect input? 
+    with pytest.raises(Exception):
+        bf.diagnostics.metrics.expected_calibration_error(pred_models, true_models, model_names=["a"])
+
+    with pytest.raises(Exception):
+        bf.diagnostics.metrics.expected_calibration_error(pred_models, true_models.T)
diff --git a/tests/test_diagnostics/test_diagnostics_plots.py b/tests/test_diagnostics/test_diagnostics_plots.py
index 97e21b2a2..f832535d1 100644
--- a/tests/test_diagnostics/test_diagnostics_plots.py
+++ b/tests/test_diagnostics/test_diagnostics_plots.py
@@ -102,3 +102,18 @@ def test_pairs_posterior(random_estimates, random_targets, random_priors):
         priors=random_priors,
         dataset_id=[1, 3],
     )
+
+
+def test_mc_calibration(pred_models, true_models, model_names):
+    out = bf.diagnostics.plots.mc_calibration(pred_models, true_models, model_names=model_names)
+    assert len(out.axes) == pred_models.shape[-1]
+    assert out.axes[0].get_ylabel() == "True Probability"
+    assert out.axes[0].get_xlabel() == "Predicted Probability"
+    assert out.axes[-1].get_title() == r"$\mathcal{M}_2$"
+
+
+def test_mc_confusion_matrix(pred_models, true_models, model_names):
+    out = bf.diagnostics.plots.mc_confusion_matrix(pred_models, true_models, model_names, normalize="true")
+    assert out.axes[0].get_ylabel() == "True model"
+    assert out.axes[0].get_xlabel() == "Predicted model"
+    assert out.axes[0].get_title() == "Confusion Matrix"
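Usage sketch (not part of the patch): a minimal example of the new metric and plot, assuming only the functions introduced or referenced in this diff (`expected_calibration_error`, `mc_calibration`, and the `softmax` helper imported in `tests/test_diagnostics/conftest.py`).

# Exercises the new ECE metric the same way the added tests do.
import numpy as np
import bayesflow as bf
from bayesflow.utils.numpy_utils import softmax

# Simulate 100 datasets from 3 competing models: one-hot true model indices
# and noisy posterior model probabilities, mirroring the test fixtures above.
rng = np.random.default_rng(2025)
true_models = np.eye(3)[rng.choice(3, size=100)].astype(np.int32)
pred_models = softmax(rng.normal(loc=true_models), axis=-1)

# ECE per model, plus the calibration-curve probabilities used for plotting.
ece = bf.diagnostics.metrics.expected_calibration_error(
    estimates=pred_models,
    targets=true_models,
    n_bins=10,
    return_probs=True,
)
print(ece["model_names"])      # ['M_0', 'M_1', 'M_2'] (auto-generated names)
print(ece["values"].shape)     # (3,) -> one ECE value per model
print(len(ece["probs_true"]))  # 3 calibration curves, one per model

# The companion diagnostic plot consumes the same inputs.
fig = bf.diagnostics.plots.mc_calibration(pred_models, true_models, n_bins=10)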