Allow get_config to return the same config more than once #487

Merged
merged 3 commits on Jan 23, 2023
3 changes: 3 additions & 0 deletions docs/source/schedulers.rst
@@ -167,6 +167,9 @@ are given in ``search_options``. These are:

* ``debug_log``: If ``True``, a useful log output about the search progress is
printed.
* ``allow_duplicates``: If ``True``, the same configuration may be suggested
more than once. The default is ``False``, in which case sampling is without
replacement.

Bayesian Optimization
~~~~~~~~~~~~~~~~~~~~~
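The new flag is forwarded through the scheduler's ``search_options``, like the other options listed in this part of the docs. A minimal usage sketch (the config space, metric name, and searcher choice below are illustrative, not part of this PR):

```python
from syne_tune.config_space import randint, uniform
from syne_tune.optimizer.schedulers import FIFOScheduler

# Illustrative search space and metric name
config_space = {
    "num_layers": randint(1, 4),
    "learning_rate": uniform(1e-4, 1e-1),
}

scheduler = FIFOScheduler(
    config_space,
    searcher="bayesopt",
    # allow_duplicates=True: the searcher may suggest a configuration that
    # was already proposed earlier, i.e. sampling is with replacement
    search_options={"allow_duplicates": True, "debug_log": True},
    metric="validation_loss",
    mode="min",
)
```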
2 changes: 2 additions & 0 deletions syne_tune/optimizer/schedulers/searchers/__init__.py
@@ -19,6 +19,7 @@
SearcherWithRandomSeed,
impute_points_to_evaluate,
extract_random_seed,
SearcherWithRandomSeedAndFilterDuplicates,
)
from syne_tune.optimizer.schedulers.searchers.random_grid_searcher import ( # noqa: F401
RandomSearcher,
@@ -33,6 +34,7 @@
"SearcherWithRandomSeed",
"impute_points_to_evaluate",
"extract_random_seed",
"SearcherWithRandomSeedAndFilterDuplicates",
"RandomSearcher",
"GridSearcher",
"searcher_factory",
32 changes: 6 additions & 26 deletions syne_tune/optimizer/schedulers/searchers/bore/bore.py
@@ -21,14 +21,7 @@
from sklearn.calibration import CalibratedClassifierCV

from syne_tune.optimizer.schedulers.searchers.searcher import (
SearcherWithRandomSeed,
sample_random_configuration,
)
from syne_tune.optimizer.schedulers.searchers.utils.hp_ranges_factory import (
make_hyperparameter_ranges,
)
from syne_tune.optimizer.schedulers.searchers.bayesopt.tuning_algorithms.common import (
ExclusionList,
SearcherWithRandomSeedAndFilterDuplicates,
)
from syne_tune.optimizer.schedulers.searchers.bore.de import (
DifferentialevolutionOptimizer,
@@ -37,7 +30,7 @@
logger = logging.getLogger(__name__)


class Bore(SearcherWithRandomSeed):
class Bore(SearcherWithRandomSeedAndFilterDuplicates):
"""
Implements "Bayesian optimization by Density Ratio Estimation" as described
in the following paper:
@@ -48,7 +41,7 @@ class Bore(SearcherWithRandomSeed):
| https://arxiv.org/abs/2102.09009

Additional arguments on top of parent class
:class:`~syne_tune.optimizer.schedulers.searchers.SearcherWithRandomSeed`:
:class:`~syne_tune.optimizer.schedulers.searchers.SearcherWithRandomSeedAndFilterDuplicates`:

:param mode: Can be "min" (default) or "max".
:param gamma: Defines the percentile, i.e., how many percent of configurations
@@ -76,6 +69,7 @@ def __init__(
config_space: dict,
metric: str,
points_to_evaluate: Optional[List[dict]] = None,
allow_duplicates: Optional[bool] = None,
mode: Optional[str] = None,
gamma: Optional[float] = None,
calibrate: Optional[bool] = None,
@@ -91,6 +85,7 @@
config_space=config_space,
metric=metric,
points_to_evaluate=points_to_evaluate,
allow_duplicates=allow_duplicates,
**kwargs,
)
if mode is None:
@@ -120,9 +115,6 @@ def __init__(
self.random_prob = random_prob
self.mode = mode

self._hp_ranges = make_hyperparameter_ranges(self.config_space)
self._excl_list = ExclusionList.empty_list(self._hp_ranges)

if classifier_kwargs is None:
classifier_kwargs = dict()
if self.classifier == "xgboost":
@@ -167,18 +159,7 @@ def _loss(self, x):
else:
return y[:, 1] # return probability of class 1

def _get_random_config(
self, exclusion_list: Optional[ExclusionList] = None
) -> dict:
if exclusion_list is None:
exclusion_list = self._excl_list
return sample_random_configuration(
hp_ranges=self._hp_ranges,
random_state=self.random_state,
exclusion_list=exclusion_list,
)

def get_config(self, **kwargs):
def _get_config(self, **kwargs):
start_time = time.time()
config = self._next_initial_config()
if config is None:
@@ -246,7 +227,6 @@ def wrapper(x):
f"config={config}] "
f"optimization time : {opt_time}"
)
self._excl_list.add(config) # Should not be suggested again

return config

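In the diff above, ``Bore`` drops its own exclusion-list bookkeeping (``_hp_ranges``, ``_excl_list``, ``_get_random_config``) and renames ``get_config`` to ``_get_config``; duplicate filtering now lives in the new ``SearcherWithRandomSeedAndFilterDuplicates`` base class, whose code is not part of this diff. A rough sketch of the wrapping pattern such a base class presumably implements (the internals below are assumptions; only the import paths and helper calls are taken from the removed lines):

```python
from syne_tune.optimizer.schedulers.searchers.searcher import SearcherWithRandomSeed
from syne_tune.optimizer.schedulers.searchers.utils.hp_ranges_factory import (
    make_hyperparameter_ranges,
)
from syne_tune.optimizer.schedulers.searchers.bayesopt.tuning_algorithms.common import (
    ExclusionList,
)


class SearcherWithRandomSeedAndFilterDuplicates(SearcherWithRandomSeed):
    """Sketch only: subclasses implement ``_get_config``; this wrapper records
    suggested configs in an exclusion list unless ``allow_duplicates`` is set."""

    def __init__(self, config_space, metric, allow_duplicates=None, **kwargs):
        super().__init__(config_space, metric=metric, **kwargs)
        self._allow_duplicates = bool(allow_duplicates)  # None defaults to False
        self._hp_ranges = make_hyperparameter_ranges(config_space)
        self._excl_list = ExclusionList.empty_list(self._hp_ranges)

    def _get_config(self, **kwargs):
        raise NotImplementedError  # implemented by Bore, RandomSearcher, ...

    def get_config(self, **kwargs):
        config = self._get_config(**kwargs)
        if config is not None and not self._allow_duplicates:
            self._excl_list.add(config)  # should not be suggested again
        return config
```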
@@ -49,6 +49,7 @@ def __init__(
config_space: dict,
metric: str,
points_to_evaluate: Optional[List[dict]] = None,
allow_duplicates: Optional[bool] = None,
mode: Optional[str] = None,
gamma: Optional[float] = None,
calibrate: Optional[bool] = None,
@@ -67,6 +68,7 @@
config_space,
metric=metric,
points_to_evaluate=points_to_evaluate,
allow_duplicates=allow_duplicates,
mode=mode,
gamma=gamma,
calibrate=calibrate,
@@ -100,6 +100,11 @@ def __init__(
self.trial_configs = dict()
self.pending_trials = set()
self.trial_observations = dict()
allow_duplicates = kwargs.get("allow_duplicates")
if allow_duplicates is not None and (not allow_duplicates):
logger.warning(
"This class does not support allow_duplicates argument. Sampling is with replacement"
)

def _update(self, trial_id: str, config: dict, result: dict):
trial_id = int(trial_id)
53 changes: 42 additions & 11 deletions syne_tune/optimizer/schedulers/searchers/gp_fifo_searcher.py
@@ -155,6 +155,7 @@ def _create_internal(
cost_attr: Optional[str] = None,
resource_attr: Optional[str] = None,
filter_observed_data: Optional[ConfigurationFilter] = None,
allow_duplicates: bool = False,
):
self.hp_ranges = hp_ranges
self.num_initial_candidates = num_initial_candidates
@@ -190,6 +191,7 @@
self._cost_attr = cost_attr
self._resource_attr = resource_attr
self._filter_observed_data = filter_observed_data
self._allow_duplicates = allow_duplicates
self._random_searcher = None
# Tracks the cumulative time spent in ``get_config`` calls
self.cumulative_get_config_time = 0
@@ -202,6 +204,7 @@
num_initial_random_choices
)
deb_msg += "- initial_scoring = {}\n".format(self.initial_scoring)
deb_msg += f"- allow_duplicates = {self._allow_duplicates}\n"
logger.info(deb_msg)

def _copy_kwargs_to_kwargs_int(self, kwargs_int: dict, kwargs: dict):
@@ -287,7 +290,7 @@ def _get_config_modelbased(
self, exclusion_candidates: ExclusionList, **kwargs
) -> Optional[Configuration]:
"""
Implements ``get_config`` part if the surrogate model is used, instead
Implements :meth:`get_config` part if the surrogate model is used, instead
of initial choices from ``points_to_evaluate`` or initial random
choices.

@@ -298,10 +301,15 @@
"""
raise NotImplementedError

def _get_exclusion_candidates(self, **kwargs) -> ExclusionList:
def _get_exclusion_candidates(self, skip_observed: bool = False) -> ExclusionList:
def skip_all(config: Configuration) -> bool:
return False

return ExclusionList(
self.state_transformer.state,
filter_observed_data=self._filter_observed_data,
filter_observed_data=skip_all
if skip_observed
else self._filter_observed_data,
)

def _should_pick_random_config(self, exclusion_candidates: ExclusionList) -> bool:
@@ -331,7 +339,9 @@ def _get_config_not_modelbased(
model-based search. If False is returned, model-based search must be
called.

:param exclusion_candidates: Configs to be avoided
:param exclusion_candidates: Configs to be avoided, even if
``allow_duplicates == True`` (in this case, we avoid configs of
failed or pending trials)
:return: ``(config, use_get_config_modelbased)``
"""
self._assign_random_searcher()
@@ -376,8 +386,10 @@ def get_config(self, **kwargs) -> Optional[dict]:
}
self.profiler.begin_block(meta)
self.profiler.start("all")
# Initial configs come from ``points_to_evaluate`` or are drawn at random
exclusion_candidates = self._get_exclusion_candidates(**kwargs)
# Initial configs come from ``points_to_evaluate`` or are drawn at random
# We use ``exclusion_candidates`` even if ``allow_duplicates == True``, in order
# to count how many unique configs have been suggested
exclusion_candidates = self._get_exclusion_candidates()
config, pick_random = self._get_config_not_modelbased(exclusion_candidates)
if self.debug_log is not None:
trial_id = kwargs.get("trial_id")
@@ -386,8 +398,18 @@
)
if not pick_random:
# Model-based decision
if not exclusion_candidates.config_space_exhausted():
config = self._get_config_modelbased(exclusion_candidates, **kwargs)
if self._allow_duplicates or (
not exclusion_candidates.config_space_exhausted()
):
# Even if ``allow_duplicates == True``, we exclude configs which are
# pending or failed
if self._allow_duplicates:
excl_cands = self._get_exclusion_candidates(skip_observed=True)
else:
excl_cands = exclusion_candidates
config = self._get_config_modelbased(
exclusion_candidates=excl_cands, **kwargs
)

if config is not None:
if self.debug_log is not None:
@@ -468,6 +490,7 @@ def _assign_random_searcher(self):
points_to_evaluate=[],
random_seed=0,
debug_log=False,
allow_duplicates=self._allow_duplicates,
)
self._random_searcher.set_random_state(self.random_state)

@@ -597,6 +620,9 @@ class GPFIFOSearcher(ModelBasedSearcher):
``opt_skip_init_length``, fitting is done only K-th call, and skipped
otherwise. Defaults to 1 (no skipping)
:type opt_skip_period: int, optional
:param allow_duplicates: If ``True``, :meth:`get_config` may return the same
configuration more than once. Defaults to ``False``
:type allow_duplicates: bool, optional
:param map_reward: In the scheduler, the metric may be minimized or
maximized, but internally, Bayesian optimization is minimizing
the criterion. ``map_reward`` converts from metric to internal
@@ -738,7 +764,7 @@ def _postprocess_config(self, config: dict) -> dict:
def _get_config_modelbased(
self, exclusion_candidates, **kwargs
) -> Optional[Configuration]:
# Obtain current SurrogateModel from state transformer. Based on
# Obtain current :class:`SurrogateModel` from state transformer. Based on
# this, the BO algorithm components can be constructed
if self.do_profile:
self.profiler.push_prefix("getconfig")
@@ -820,14 +846,16 @@ def get_batch_configs(
if config is not None:
configs.append(config)
else:
# ``DebugLogWriter`` does not support batch selection right now,
# :class:`DebugLogWriter` does not support batch selection right now,
# must be switched off
assert self.debug_log is None, (
"``get_batch_configs`` does not support debug_log right now. "
+ "Please set ``debug_log=False`` in search_options argument "
+ "of scheduler, or create your searcher with ``debug_log=False``"
)
exclusion_candidates = self._get_exclusion_candidates(**kwargs)
exclusion_candidates = self._get_exclusion_candidates(
skip_observed=self._allow_duplicates
)
pick_random = True
while pick_random and len(configs) < batch_size:
config, pick_random = self._get_config_not_modelbased(
@@ -836,6 +864,8 @@
if pick_random:
if config is not None:
configs.append(config)
# Even if ``allow_duplicates == True``, we don't want to have
# duplicates in the same batch
exclusion_candidates.add(config)
else:
break # Space exhausted
@@ -921,6 +951,7 @@ def _new_searcher_kwargs_for_clone(self) -> dict:
cost_attr=self._cost_attr,
resource_attr=self._resource_attr,
filter_observed_data=self._filter_observed_data,
allow_duplicates=self._allow_duplicates,
)

def clone_from_state(self, state):
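A practical consequence of the ``get_config`` changes above: with ``allow_duplicates=False`` a finite config space can be exhausted, after which ``get_config`` returns ``None``, while ``allow_duplicates=True`` keeps producing suggestions (only pending and failed trials remain excluded). A small sketch using ``RandomSearcher`` on a tiny finite space (the space, metric name, and trial ids are illustrative):

```python
from syne_tune.config_space import choice
from syne_tune.optimizer.schedulers.searchers import RandomSearcher

# Only 2 * 2 = 4 distinct configurations exist in this space
config_space = {
    "optimizer": choice(["sgd", "adam"]),
    "batch_size": choice([32, 64]),
}

# Without replacement (the default): once all 4 configs have been suggested,
# further calls are expected to return None
searcher = RandomSearcher(config_space, metric="loss", allow_duplicates=False)
print([searcher.get_config(trial_id=str(i)) for i in range(6)])

# With replacement: duplicates may be suggested, the space is never exhausted
searcher = RandomSearcher(config_space, metric="loss", allow_duplicates=True)
print([searcher.get_config(trial_id=str(i)) for i in range(6)])
```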
10 changes: 9 additions & 1 deletion syne_tune/optimizer/schedulers/searchers/gp_searcher_factory.py
@@ -445,7 +445,12 @@ def _create_common_objects(model=None, is_hypertune=False, **kwargs):
)
result["num_initial_candidates"] = kwargs["num_init_candidates"]
result["num_initial_random_choices"] = kwargs["num_init_random"]
for k in ("initial_scoring", "cost_attr", "skip_local_optimization"):
for k in (
"initial_scoring",
"cost_attr",
"skip_local_optimization",
"allow_duplicates",
):
result[k] = kwargs[k]

return result
@@ -797,6 +802,7 @@ def _common_defaults(
"cost_attr": "elapsed_time",
"normalize_targets": True,
"no_fantasizing": False,
"allow_duplicates": False,
}
if is_hyperband:
if is_hypertune:
@@ -834,6 +840,8 @@
"skip_local_optimization": Boolean(),
"debug_log": Boolean(),
"normalize_targets": Boolean(),
"no_fantasizing": Boolean(),
"allow_duplicates": Boolean(),
}

if is_hyperband:
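The factory changes follow the file's existing pattern: ``allow_duplicates`` gets a default in ``_common_defaults`` and a ``Boolean()`` constraint, so the value passed in (for example via ``search_options``) can be validated before being forwarded to the searcher. A simplified, self-contained illustration of that merge-and-validate pattern (hypothetical helpers, not the actual syne-tune code):

```python
from typing import Any, Dict

# Hypothetical stand-ins for the factory's defaults and constraints
DEFAULTS: Dict[str, Any] = {"debug_log": True, "allow_duplicates": False}
CONSTRAINTS: Dict[str, type] = {"debug_log": bool, "allow_duplicates": bool}


def resolve_search_options(search_options: Dict[str, Any]) -> Dict[str, Any]:
    """Merge user options with defaults, then type-check against constraints."""
    resolved = dict(DEFAULTS)
    resolved.update(search_options)
    for key, expected_type in CONSTRAINTS.items():
        if not isinstance(resolved[key], expected_type):
            raise ValueError(f"{key} must be of type {expected_type.__name__}")
    return resolved


# The new flag flows through like any other option
print(resolve_search_options({"allow_duplicates": True}))
# {'debug_log': True, 'allow_duplicates': True}
```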