From 1a2e35d040b8925939db55551b3e7340a67f8bb2 Mon Sep 17 00:00:00 2001 From: stefanradev93 Date: Fri, 4 Apr 2025 19:12:52 -0400 Subject: [PATCH 1/7] Add c2st --- .../metrics/classifier_two_sample_test.py | 121 ++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 bayesflow/diagnostics/metrics/classifier_two_sample_test.py diff --git a/bayesflow/diagnostics/metrics/classifier_two_sample_test.py b/bayesflow/diagnostics/metrics/classifier_two_sample_test.py new file mode 100644 index 000000000..743ccf63e --- /dev/null +++ b/bayesflow/diagnostics/metrics/classifier_two_sample_test.py @@ -0,0 +1,121 @@ +from typing import Sequence, Mapping, Any + +import numpy as np + +import keras + +from bayesflow.utils.exceptions import ShapeError +from bayesflow.networks import MLP + + +def classifier_two_sample_test( + estimates: np.ndarray, + targets: np.ndarray, + metric: str = "accuracy", + patience: int = 10, + max_epochs: int = 1000, + batch_size: int = 64, + return_metric_only: bool = True, + validation_split: float = 0.5, + standardize: bool = True, + mlp_widths: Sequence = (256, 256), + **kwargs, +) -> float | Mapping[str, Any]: + """ + C2ST metric [1] between samples from two distributions computed using a neural classifier. + Can be computationally expensive, since it involves training of a neural classifier. + + Note: works best for large numbers of samples and averaged across different posteriors. + + Code adapted from https://github.com/sbi-benchmark/sbibm/blob/main/sbibm/metrics/c2st.py + + [1] Lopez-Paz, D., & Oquab, M. (2016). Revisiting classifier two-sample tests. arXiv:1610.06545. + + Parameters + ---------- + estimates : np.ndarray + Array of shape (num_samples_est, num_variables) containing samples representing estimated quantities + (e.g., approximate posterior samples). + targets : np.ndarray + Array of shape (num_samples_tar, num_variables) containing target samples + (e.g., samples from a reference posterior). + metric : str, optional + Metric to evaluate the classifier performance. Default is "accuracy". + patience : int, optional + Number of epochs with no improvement after which training will be stopped. Default is 5. + max_epochs : int, optional + Maximum number of epochs to train the classifier. Default is 1000. + batch_size : int, optional + Number of samples per batch during training. Default is 64. + return_metric_only : bool, optional + If True, only the final validation metric is returned. Otherwise, a dictionary with the score, classifier, and + full training history is returned. Default is True. + validation_split : float, optional + Fraction of the training data to be used as validation data. Default is 0.5. + standardize : bool, optional + If True, both estimates and targets will be standardized using the mean and standard deviation of estimates. + Default is True. + mlp_widths : Sequence[int], optional + Sequence specifying the number of units in each hidden layer of the MLP classifier. Default is (256, 256). + **kwargs + Additional keyword arguments. Recognized keyword: + mlp_kwargs : dict + Dictionary of additional parameters to pass to the MLP constructor. + + Returns + ------- + results : float or dict + If return_metric_only is True, returns the final validation metric (e.g., accuracy) as a float. + Otherwise, returns a dictionary with keys "score", "classifier", and "history", where "score" + is the final validation metric, "classifier" is the trained Keras model, and "history" contains the + full training history. 
+ """ + + # Convert tensors to numpy, if passed + estimates = keras.ops.convert_to_numpy(estimates) + targets = keras.ops.convert_to_numpy(targets) + + # Error, if targets dim does not match estimates dim + num_dims = estimates.shape[1] + if not num_dims == targets.shape[1]: + raise ShapeError( + f"estimates and targets can have different number of samples (1st dim)" + f"but must have the same dimensionality (2nd dim)" + f"found: estimates shape {estimates.shape[1]}, targets shape {targets.shape[1]}" + ) + + # Standardize both estimates and targets relative to estimates mean and std + if standardize: + estimates_mean = np.mean(estimates, axis=0) + estimates_std = np.std(estimates, axis=0) + estimates = (estimates - estimates_mean) / estimates_std + targets = (targets - estimates_mean) / estimates_std + + # Create data for classification task + data = np.r_[estimates, targets] + labels = np.r_[np.zeros((estimates.shape[0],)), np.ones((targets.shape[0],))] + + # Create and train classifier with optional stopping + classifier = keras.Sequential( + [MLP(widths=mlp_widths, **kwargs.get("mlp_kwargs", {})), keras.layers.Dense(1, activation="sigmoid")] + ) + + classifier.compile(optimizer="adam", loss="binary_crossentropy", metrics=[metric]) + + early_stopping = keras.callbacks.EarlyStopping( + monitor=f"val_{metric}", patience=patience, restore_best_weights=True + ) + + history = classifier.fit( + x=data, + y=labels, + epochs=max_epochs, + batch_size=batch_size, + verbose=0, + callbacks=[early_stopping], + validation_split=validation_split, + ) + + if return_metric_only: + return history.history[f"val_{metric}"][-1] + return {"score": history.history[f"val_{metric}"][-1], "classifier": classifier, "history": history.history} From ee201d553773e8f40dae8faa6cb2ba65ad213293 Mon Sep 17 00:00:00 2001 From: stefanradev93 Date: Mon, 7 Apr 2025 17:22:23 -0400 Subject: [PATCH 2/7] Remove sklearn as dep --- bayesflow/diagnostics/metrics/__init__.py | 1 + .../metrics/expected_calibration_error.py | 18 ++-- .../diagnostics/plots/mc_confusion_matrix.py | 12 ++- bayesflow/utils/__init__.py | 1 + bayesflow/utils/classification/__init__.py | 2 + .../utils/classification/calibration_curve.py | 82 +++++++++++++++++++ .../utils/classification/confusion_matrix.py | 58 +++++++++++++ pyproject.toml | 1 - tests/test_diagnostics/conftest.py | 10 +++ .../test_diagnostics_metrics.py | 8 ++ 10 files changed, 176 insertions(+), 17 deletions(-) create mode 100644 bayesflow/utils/classification/__init__.py create mode 100644 bayesflow/utils/classification/calibration_curve.py create mode 100644 bayesflow/utils/classification/confusion_matrix.py diff --git a/bayesflow/diagnostics/metrics/__init__.py b/bayesflow/diagnostics/metrics/__init__.py index 1fb5f2b8d..ceeca4cc4 100644 --- a/bayesflow/diagnostics/metrics/__init__.py +++ b/bayesflow/diagnostics/metrics/__init__.py @@ -2,3 +2,4 @@ from .posterior_contraction import posterior_contraction from .root_mean_squared_error import root_mean_squared_error from .expected_calibration_error import expected_calibration_error +from .classifier_two_sample_test import classifier_two_sample_test diff --git a/bayesflow/diagnostics/metrics/expected_calibration_error.py b/bayesflow/diagnostics/metrics/expected_calibration_error.py index bf6fb951e..81f4d9cb6 100644 --- a/bayesflow/diagnostics/metrics/expected_calibration_error.py +++ b/bayesflow/diagnostics/metrics/expected_calibration_error.py @@ -3,14 +3,14 @@ from typing import Sequence, Any, Mapping from ...utils.exceptions import 
ShapeError -from sklearn.calibration import calibration_curve +from ...utils.classification import calibration_curve def expected_calibration_error( estimates: np.ndarray, targets: np.ndarray, model_names: Sequence[str] = None, - n_bins: int = 10, + num_bins: int = 10, return_probs: bool = False, ) -> Mapping[str, Any]: """ @@ -31,11 +31,11 @@ def expected_calibration_error( The one-hot-encoded true model indices. model_names : Sequence[str], optional (default = None) Optional model names to show in the output. By default, models are called "M_" + model index. - n_bins : int, optional, default: 10 + num_bins : int, optional, default: 10 The number of bins to use for the calibration curves (and marginal histograms). - Passed into ``sklearn.calibration.calibration_curve()``. + Passed into ``bayesflow.utils.calibration_curve()``. return_probs : bool (default = False) - Do you want to obtain the output of ``sklearn.calibration.calibration_curve()``? + Do you want to obtain the output of ``bayesflow.utils.calibration_curve()``? Returns ------- @@ -48,9 +48,9 @@ def expected_calibration_error( - "model_names" : str The (inferred) variable names. - "probs_true": (optional) list[np.ndarray]: - Outputs of ``sklearn.calibration.calibration_curve()`` per model + Outputs of ``bayesflow.utils.calibration.calibration_curve()`` per model - "probs_pred": (optional) list[np.ndarray]: - Outputs of ``sklearn.calibration.calibration_curve()`` per model + Outputs of ``bayesflow.utils.calibration.calibration_curve()`` per model """ # Convert tensors to numpy, if passed @@ -76,10 +76,10 @@ def expected_calibration_error( for model_index in range(estimates.shape[-1]): y_true = (targets == model_index).astype(np.float32) y_prob = estimates[..., model_index] - prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=n_bins) + prob_true, prob_pred = calibration_curve(y_true, y_prob, num_bins=num_bins) # Compute ECE by weighting bin errors by bin size - bins = np.linspace(0.0, 1.0, n_bins + 1) + bins = np.linspace(0.0, 1.0, num_bins + 1) binids = np.searchsorted(bins[1:-1], y_prob) bin_total = np.bincount(binids, minlength=len(bins)) nonzero = bin_total != 0 diff --git a/bayesflow/diagnostics/plots/mc_confusion_matrix.py b/bayesflow/diagnostics/plots/mc_confusion_matrix.py index 2d27b5073..db5165eae 100644 --- a/bayesflow/diagnostics/plots/mc_confusion_matrix.py +++ b/bayesflow/diagnostics/plots/mc_confusion_matrix.py @@ -5,9 +5,8 @@ from matplotlib.colors import LinearSegmentedColormap import numpy as np -from sklearn.metrics import confusion_matrix - -from bayesflow.utils.plot_utils import make_figure +from ...utils.plot_utils import make_figure +from ...utils.classification import confusion_matrix def mc_confusion_matrix( @@ -50,10 +49,9 @@ def mc_confusion_matrix( ytick_rotation: int, optional, default: None Rotation of y-axis tick labels (helps with long model names). normalize : {'true', 'pred', 'all'}, default=None - Passed to sklearn.metrics.confusion_matrix. - Normalizes confusion matrix over the true (rows), predicted (columns) - conditions or all the population. If None, confusion matrix will not be - normalized. + Passed to confusion matrix. Normalizes confusion matrix over the true (rows), + predicted (columns) conditions or all the population. If None, confusion matrix + will not be normalized. cmap : matplotlib.colors.Colormap or str, optional, default: None Colormap to be used for the cells. If a str, it should be the name of a registered colormap, e.g., 'viridis'. 
Default colormap matches the BayesFlow defaults by ranging from white to red. diff --git a/bayesflow/utils/__init__.py b/bayesflow/utils/__init__.py index a359e0d8a..1e7be7018 100644 --- a/bayesflow/utils/__init__.py +++ b/bayesflow/utils/__init__.py @@ -72,6 +72,7 @@ tree_stack, fill_triangular_matrix, ) +from .classification import calibration_curve, confusion_matrix from .validators import check_lengths_same from .workflow_utils import find_inference_network, find_summary_network diff --git a/bayesflow/utils/classification/__init__.py b/bayesflow/utils/classification/__init__.py new file mode 100644 index 000000000..94e6dbdfd --- /dev/null +++ b/bayesflow/utils/classification/__init__.py @@ -0,0 +1,2 @@ +from .confusion_matrix import confusion_matrix +from .calibration_curve import calibration_curve diff --git a/bayesflow/utils/classification/calibration_curve.py b/bayesflow/utils/classification/calibration_curve.py new file mode 100644 index 000000000..427449094 --- /dev/null +++ b/bayesflow/utils/classification/calibration_curve.py @@ -0,0 +1,82 @@ +import numpy as np + + +def calibration_curve( + targets: np.ndarray, + estimates: np.ndarray, + *, + pos_label: int | float | bool | str = 1, + num_bins: int = 5, + strategy: str = "uniform", +): + """Compute true and predicted probabilities for a calibration curve. + + The method assumes the inputs come from a binary classifier, and + discretize the [0, 1] interval into bins. + + Code from: https://github.com/scikit-learn/scikit-learn/blob/98ed9dc73/sklearn/calibration.py#L927 + + Parameters + ---------- + targets : array-like of shape (n_samples,) + True targets. + estimates : array-like of shape (n_samples,) + Probabilities of the positive class. + pos_label : int, float, bool or str, default = 1 + The label of the positive class. + num_bins : int, default=5 + Number of bins to discretize the [0, 1] interval. A bigger number + requires more data. Bins with no samples (i.e. without + corresponding values in `y_prob`) will not be returned, thus the + returned arrays may have less than `n_bins` values. + strategy : {'uniform', 'quantile'}, default='uniform' + Strategy used to define the widths of the bins. + + uniform + The bins have identical widths. + quantile + The bins have the same number of samples and depend on `y_prob`. + + Returns + ------- + prob_true : ndarray of shape (n_bins,) or smaller + The proportion of samples whose class is the positive class, in each + bin (fraction of positives). + + prob_pred : ndarray of shape (n_bins,) or smaller + The mean estimated probability in each bin. + + References + ---------- + Alexandru Niculescu-Mizil and Rich Caruana (2005) Predicting Good + Probabilities With Supervised Learning, in Proceedings of the 22nd + International Conference on Machine Learning (ICML). + """ + + if estimates.min() < 0 or estimates.max() > 1: + raise ValueError("y_prob has values outside [0, 1].") + + labels = np.unique(targets) + if len(labels) > 2: + raise ValueError(f"Only binary classification is supported. Provided labels {labels}.") + targets = targets == pos_label + + if strategy == "quantile": # Determine bin edges by distribution of data + quantiles = np.linspace(0, 1, num_bins + 1) + bins = np.percentile(estimates, quantiles * 100) + elif strategy == "uniform": + bins = np.linspace(0.0, 1.0, num_bins + 1) + else: + raise ValueError("Invalid entry to 'strategy' input. 
Strategy must be either 'quantile' or 'uniform'.") + + binids = np.searchsorted(bins[1:-1], estimates) + + bin_sums = np.bincount(binids, weights=estimates, minlength=len(bins)) + bin_true = np.bincount(binids, weights=targets, minlength=len(bins)) + bin_total = np.bincount(binids, minlength=len(bins)) + + nonzero = bin_total != 0 + prob_true = bin_true[nonzero] / bin_total[nonzero] + prob_pred = bin_sums[nonzero] / bin_total[nonzero] + + return prob_true, prob_pred diff --git a/bayesflow/utils/classification/confusion_matrix.py b/bayesflow/utils/classification/confusion_matrix.py new file mode 100644 index 000000000..c9a4a9e5d --- /dev/null +++ b/bayesflow/utils/classification/confusion_matrix.py @@ -0,0 +1,58 @@ +from typing import Sequence + +import numpy as np + + +def confusion_matrix(targets: np.ndarray, estimates: np.ndarray, labels: Sequence = None, normalize: str = None): + """ + Compute confusion matrix to evaluate the accuracy of a classification or model comparison setting. + + Code inspired by: https://github.com/scikit-learn/scikit-learn/blob/98ed9dc73/sklearn/metrics/_classification.py + + Parameters + ---------- + targets : np.ndarray + Ground truth (correct) target values. + estimates : np.ndarray + Estimated targets as returned by a classifier. + labels : Sequence, optional + List of labels to index the matrix. This may be used to reorder or select a subset of labels. + If None, labels that appear at least once in y_true or y_pred are used in sorted order. + normalize : {'true', 'pred', 'all'}, optional + Normalizes confusion matrix over the true (rows), predicted (columns) + conditions or all the population. If None, no normalization is applied. + + Returns + ------- + cm : np.ndarray of shape (num_labels, num_labels) + Confusion matrix. Rows represent true classes, columns represent predicted classes. 
+ """ + + # Get unique labels + labels = np.asarray(labels) or np.unique(np.concatenate((targets, estimates))) + + label_to_index = {label: i for i, label in enumerate(labels)} + num_labels = len(labels) + + # Initialize the confusion matrix + cm = np.zeros((num_labels, num_labels), dtype=np.int64) + + # Fill confusion matrix + for t, p in zip(targets, estimates): + if t in label_to_index and p in label_to_index: + cm[label_to_index[t], label_to_index[p]] += 1 + + # Normalize if required + if normalize == "true": + with np.errstate(all="ignore"): + cm = cm.astype(np.float64) + cm = np.divide(cm, cm.sum(axis=1, keepdims=True), where=cm.sum(axis=1, keepdims=True) != 0) + elif normalize == "pred": + with np.errstate(all="ignore"): + cm = cm.astype(np.float64) + cm = np.divide(cm, cm.sum(axis=0, keepdims=True), where=cm.sum(axis=0, keepdims=True) != 0) + elif normalize == "all": + cm = cm.astype(np.float64) + cm /= cm.sum() + + return cm diff --git a/pyproject.toml b/pyproject.toml index 5f83611f2..8018b55a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,6 @@ dependencies = [ "matplotlib", "numpy >= 1.24, <2.0", "pandas", - "scikit-learn", "scipy", "seaborn", "tqdm", diff --git a/tests/test_diagnostics/conftest.py b/tests/test_diagnostics/conftest.py index 9bd9bd907..6ecff445d 100644 --- a/tests/test_diagnostics/conftest.py +++ b/tests/test_diagnostics/conftest.py @@ -8,6 +8,16 @@ def var_names(): return [r"$\beta_0$", r"$\beta_1$", r"$\sigma$"] +@pytest.fixture() +def random_samples_a(): + return np.random.normal(loc=0, scale=1, size=(1000, 8)) + + +@pytest.fixture() +def random_samples_b(): + return np.random.normal(loc=0, scale=3, size=(1000, 8)) + + @pytest.fixture() def random_estimates(): return { diff --git a/tests/test_diagnostics/test_diagnostics_metrics.py b/tests/test_diagnostics/test_diagnostics_metrics.py index 792cf558a..017b71555 100644 --- a/tests/test_diagnostics/test_diagnostics_metrics.py +++ b/tests/test_diagnostics/test_diagnostics_metrics.py @@ -50,6 +50,14 @@ def test_root_mean_squared_error(random_estimates, random_targets): assert out["variable_names"] == ["beta_0", "beta_1", "sigma"] +def test_classifier_two_sample_test(random_samples_a, random_samples_b): + metric = bf.diagnostics.metrics.classifier_two_sample_test(estimates=random_samples_a, targets=random_samples_a) + assert 0.6 > metric > 0.4 + + metric = bf.diagnostics.metrics.classifier_two_sample_test(estimates=random_samples_a, targets=random_samples_b) + assert metric > 0.6 + + def test_expected_calibration_error(pred_models, true_models, model_names): out = bf.diagnostics.metrics.expected_calibration_error(pred_models, true_models, model_names=model_names) assert list(out.keys()) == ["values", "metric_name", "model_names"] From d795788ae3c70fe606debe916340544f61271b35 Mon Sep 17 00:00:00 2001 From: stefanradev93 Date: Mon, 7 Apr 2025 19:21:28 -0400 Subject: [PATCH 3/7] Add test and fix weird keras behavior --- .../diagnostics/metrics/classifier_two_sample_test.py | 11 ++++++++--- tests/test_diagnostics/conftest.py | 4 ++-- tests/test_diagnostics/test_diagnostics_metrics.py | 4 ++-- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/bayesflow/diagnostics/metrics/classifier_two_sample_test.py b/bayesflow/diagnostics/metrics/classifier_two_sample_test.py index 743ccf63e..c2750ee80 100644 --- a/bayesflow/diagnostics/metrics/classifier_two_sample_test.py +++ b/bayesflow/diagnostics/metrics/classifier_two_sample_test.py @@ -12,13 +12,13 @@ def classifier_two_sample_test( estimates: 
np.ndarray, targets: np.ndarray, metric: str = "accuracy", - patience: int = 10, + patience: int = 5, max_epochs: int = 1000, - batch_size: int = 64, + batch_size: int = 128, return_metric_only: bool = True, validation_split: float = 0.5, standardize: bool = True, - mlp_widths: Sequence = (256, 256), + mlp_widths: Sequence = (64, 64), **kwargs, ) -> float | Mapping[str, Any]: """ @@ -95,6 +95,11 @@ def classifier_two_sample_test( data = np.r_[estimates, targets] labels = np.r_[np.zeros((estimates.shape[0],)), np.ones((targets.shape[0],))] + # Important: needed, since keras does not shuffle before selecting validation split + shuffle_idx = np.random.permutation(data.shape[0]) + data = data[shuffle_idx] + labels = labels[shuffle_idx] + # Create and train classifier with optional stopping classifier = keras.Sequential( [MLP(widths=mlp_widths, **kwargs.get("mlp_kwargs", {})), keras.layers.Dense(1, activation="sigmoid")] diff --git a/tests/test_diagnostics/conftest.py b/tests/test_diagnostics/conftest.py index 6ecff445d..5103ea455 100644 --- a/tests/test_diagnostics/conftest.py +++ b/tests/test_diagnostics/conftest.py @@ -10,12 +10,12 @@ def var_names(): @pytest.fixture() def random_samples_a(): - return np.random.normal(loc=0, scale=1, size=(1000, 8)) + return np.random.normal(loc=0, scale=1, size=(5000, 8)) @pytest.fixture() def random_samples_b(): - return np.random.normal(loc=0, scale=3, size=(1000, 8)) + return np.random.normal(loc=0, scale=3, size=(5000, 8)) @pytest.fixture() diff --git a/tests/test_diagnostics/test_diagnostics_metrics.py b/tests/test_diagnostics/test_diagnostics_metrics.py index 017b71555..4fb0945b3 100644 --- a/tests/test_diagnostics/test_diagnostics_metrics.py +++ b/tests/test_diagnostics/test_diagnostics_metrics.py @@ -52,10 +52,10 @@ def test_root_mean_squared_error(random_estimates, random_targets): def test_classifier_two_sample_test(random_samples_a, random_samples_b): metric = bf.diagnostics.metrics.classifier_two_sample_test(estimates=random_samples_a, targets=random_samples_a) - assert 0.6 > metric > 0.4 + assert 0.55 > metric > 0.45 metric = bf.diagnostics.metrics.classifier_two_sample_test(estimates=random_samples_a, targets=random_samples_b) - assert metric > 0.6 + assert metric > 0.55 def test_expected_calibration_error(pred_models, true_models, model_names): From cfdfed67c05e63c0090e99f79152df5acd3be176 Mon Sep 17 00:00:00 2001 From: stefanradev93 Date: Mon, 7 Apr 2025 20:04:54 -0400 Subject: [PATCH 4/7] Fix n_bins to num_bins --- bayesflow/diagnostics/plots/calibration_histogram.py | 2 +- bayesflow/diagnostics/plots/mc_calibration.py | 12 ++++++------ bayesflow/utils/classification/calibration_curve.py | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/bayesflow/diagnostics/plots/calibration_histogram.py b/bayesflow/diagnostics/plots/calibration_histogram.py index c7c2de408..6ff10c881 100644 --- a/bayesflow/diagnostics/plots/calibration_histogram.py +++ b/bayesflow/diagnostics/plots/calibration_histogram.py @@ -102,7 +102,7 @@ def calibration_histogram( "Confidence intervals might be unreliable!" 
) - # Set n_bins automatically, if nothing provided + # Set num_bins automatically, if nothing provided if num_bins is None: num_bins = int(ratio / 2) # Attempt a fix if a single bin is determined so plot still makes sense diff --git a/bayesflow/diagnostics/plots/mc_calibration.py b/bayesflow/diagnostics/plots/mc_calibration.py index 1f1e3678f..0e80241e5 100644 --- a/bayesflow/diagnostics/plots/mc_calibration.py +++ b/bayesflow/diagnostics/plots/mc_calibration.py @@ -18,7 +18,7 @@ def mc_calibration( pred_models: dict[str, np.ndarray] | np.ndarray, true_models: dict[str, np.ndarray] | np.ndarray, model_names: Sequence[str] = None, - n_bins: int = 10, + num_bins: int = 10, label_fontsize: int = 16, title_fontsize: int = 18, metric_fontsize: int = 14, @@ -41,12 +41,12 @@ def mc_calibration( The one-hot-encoded true model indices per data set. model_names : list or None, optional, default: None The model names for nice plot titles. Inferred if None. - n_bins : int, optional, default: 10 + num_bins : int, optional, default: 10 The number of bins to use for the calibration curves (and marginal histograms). label_fontsize : int, optional, default: 16 The font size of the y-label and y-label texts - legend_fontsize : int, optional, default: 14 - The font size of the legend text (ECE value) + metric_fontsize : int, optional, default: 14 + The font size of the metric (e.g., ECE) title_fontsize : int, optional, default: 18 The font size of the title text. Only relevant if `stacked=False` tick_fontsize : int, optional, default: 12 @@ -83,7 +83,7 @@ def mc_calibration( estimates=pred_models, targets=true_models, model_names=plot_data["variable_names"], - n_bins=n_bins, + num_bins=num_bins, return_probs=True, ) @@ -92,7 +92,7 @@ def mc_calibration( ax.plot(ece["probs_pred"][j], ece["probs_true"][j], "o-", color=color) # Plot PMP distribution over bins - uniform_bins = np.linspace(0.0, 1.0, n_bins + 1) + uniform_bins = np.linspace(0.0, 1.0, num_bins + 1) norm_weights = np.ones_like(plot_data["estimates"]) / len(plot_data["estimates"]) ax.hist(plot_data["estimates"][:, j], bins=uniform_bins, weights=norm_weights[:, j], color="grey", alpha=0.3) diff --git a/bayesflow/utils/classification/calibration_curve.py b/bayesflow/utils/classification/calibration_curve.py index 427449094..38e49fc08 100644 --- a/bayesflow/utils/classification/calibration_curve.py +++ b/bayesflow/utils/classification/calibration_curve.py @@ -27,8 +27,8 @@ def calibration_curve( num_bins : int, default=5 Number of bins to discretize the [0, 1] interval. A bigger number requires more data. Bins with no samples (i.e. without - corresponding values in `y_prob`) will not be returned, thus the - returned arrays may have less than `n_bins` values. + corresponding values in `estimates`) will not be returned, thus the + returned arrays may have less than `num_bins` values. strategy : {'uniform', 'quantile'}, default='uniform' Strategy used to define the widths of the bins. @@ -39,11 +39,11 @@ def calibration_curve( Returns ------- - prob_true : ndarray of shape (n_bins,) or smaller + prob_true : ndarray of shape (num_bins,) or smaller The proportion of samples whose class is the positive class, in each bin (fraction of positives). - prob_pred : ndarray of shape (n_bins,) or smaller + prob_pred : ndarray of shape (num_bins,) or smaller The mean estimated probability in each bin. 
References From 0ed2524d867f49e9d3809e123035e19bfe91480b Mon Sep 17 00:00:00 2001 From: stefanradev93 Date: Tue, 8 Apr 2025 11:03:08 -0400 Subject: [PATCH 5/7] Fix torch autograd bug --- .../metrics/classifier_two_sample_test.py | 37 ++++++++++++------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/bayesflow/diagnostics/metrics/classifier_two_sample_test.py b/bayesflow/diagnostics/metrics/classifier_two_sample_test.py index c2750ee80..1dd87ea49 100644 --- a/bayesflow/diagnostics/metrics/classifier_two_sample_test.py +++ b/bayesflow/diagnostics/metrics/classifier_two_sample_test.py @@ -71,10 +71,6 @@ def classifier_two_sample_test( full training history. """ - # Convert tensors to numpy, if passed - estimates = keras.ops.convert_to_numpy(estimates) - targets = keras.ops.convert_to_numpy(targets) - # Error, if targets dim does not match estimates dim num_dims = estimates.shape[1] if not num_dims == targets.shape[1]: @@ -111,15 +107,30 @@ def classifier_two_sample_test( monitor=f"val_{metric}", patience=patience, restore_best_weights=True ) - history = classifier.fit( - x=data, - y=labels, - epochs=max_epochs, - batch_size=batch_size, - verbose=0, - callbacks=[early_stopping], - validation_split=validation_split, - ) + # For now, we need to enable grads, since we turn them off by default + if keras.backend.backend() == "torch": + import torch + + with torch.enable_grad(): + history = classifier.fit( + x=data, + y=labels, + epochs=max_epochs, + batch_size=batch_size, + verbose=0, + callbacks=[early_stopping], + validation_split=validation_split, + ) + else: + history = classifier.fit( + x=data, + y=labels, + epochs=max_epochs, + batch_size=batch_size, + verbose=0, + callbacks=[early_stopping], + validation_split=validation_split, + ) if return_metric_only: return history.history[f"val_{metric}"][-1] From 00ff92755c77ff7c8a8c9fd8264124868d27f3f8 Mon Sep 17 00:00:00 2001 From: stefanradev93 Date: Tue, 8 Apr 2025 11:07:52 -0400 Subject: [PATCH 6/7] Fix doc --- bayesflow/diagnostics/metrics/classifier_two_sample_test.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bayesflow/diagnostics/metrics/classifier_two_sample_test.py b/bayesflow/diagnostics/metrics/classifier_two_sample_test.py index 1dd87ea49..d1382ae5d 100644 --- a/bayesflow/diagnostics/metrics/classifier_two_sample_test.py +++ b/bayesflow/diagnostics/metrics/classifier_two_sample_test.py @@ -23,12 +23,11 @@ def classifier_two_sample_test( ) -> float | Mapping[str, Any]: """ C2ST metric [1] between samples from two distributions computed using a neural classifier. - Can be computationally expensive, since it involves training of a neural classifier. + Can be computationally expensive if called in a loop[, since it needs to train the model + for each set of samples. Note: works best for large numbers of samples and averaged across different posteriors. - Code adapted from https://github.com/sbi-benchmark/sbibm/blob/main/sbibm/metrics/c2st.py - [1] Lopez-Paz, D., & Oquab, M. (2016). Revisiting classifier two-sample tests. arXiv:1610.06545. 
Parameters From 8b9b16f5765eb01501cb5e1654b770e41aa40b1e Mon Sep 17 00:00:00 2001 From: Lars Date: Tue, 8 Apr 2025 14:11:30 -0400 Subject: [PATCH 7/7] Update bayesflow/utils/classification/confusion_matrix.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- bayesflow/utils/classification/confusion_matrix.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bayesflow/utils/classification/confusion_matrix.py b/bayesflow/utils/classification/confusion_matrix.py index c9a4a9e5d..4aadda28f 100644 --- a/bayesflow/utils/classification/confusion_matrix.py +++ b/bayesflow/utils/classification/confusion_matrix.py @@ -29,7 +29,10 @@ def confusion_matrix(targets: np.ndarray, estimates: np.ndarray, labels: Sequenc """ # Get unique labels - labels = np.asarray(labels) or np.unique(np.concatenate((targets, estimates))) + if labels is None: + labels = np.unique(np.concatenate((targets, estimates))) + else: + labels = np.asarray(labels) label_to_index = {label: i for i, label in enumerate(labels)} num_labels = len(labels)
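
---

For reference, the sketch below shows one way the functionality introduced by this series could be exercised end to end: the `classifier_two_sample_test` diagnostic added in PATCH 1/7 (and refined in PATCHES 3–6), plus the `calibration_curve` and `confusion_matrix` utilities that replace the scikit-learn dependency in PATCH 2/7. Only the function names, module paths, and argument names are taken from the patches above; everything else (the random seed, sample sizes, and synthetic labels/probabilities) is illustrative and mirrors the test fixtures rather than any real use case. Note that the C2ST call trains a small Keras MLP, so it needs a working Keras backend and is not instantaneous.

import numpy as np

import bayesflow as bf
from bayesflow.utils.classification import calibration_curve, confusion_matrix

rng = np.random.default_rng(seed=2025)  # illustrative seed, not from the patches

# C2ST: samples from overlapping distributions should score near 0.5 (chance level),
# clearly different ones well above 0.5 -- the tests above expect > 0.55 for a
# threefold difference in scale.
approx_samples = rng.normal(loc=0.0, scale=1.0, size=(5000, 8))
reference_samples = rng.normal(loc=0.0, scale=3.0, size=(5000, 8))

score = bf.diagnostics.metrics.classifier_two_sample_test(
    estimates=approx_samples,
    targets=reference_samples,
)
print(f"C2ST validation accuracy: {score:.3f}")

# The scikit-learn replacements from bayesflow.utils.classification, applied to a
# synthetic binary classification problem with probabilities in [0, 1].
labels_true = rng.integers(0, 2, size=200)
probs = np.clip(labels_true * 0.7 + rng.uniform(0.0, 0.3, size=200), 0.0, 1.0)

prob_true, prob_pred = calibration_curve(labels_true, probs, num_bins=5)
cm = confusion_matrix(labels_true, (probs > 0.5).astype(int), normalize="true")

print("calibration curve (fraction of positives, mean predicted prob):", prob_true, prob_pred)
print("row-normalized confusion matrix:")
print(cm)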