Fix issue #905

* store the selector in the home directory of the user following https://specifications.freedesktop.org/basedir-spec/ This means that by default the selector is put into ~/.cache/auto-sklearn/ * make the AutoSklearn2Classifier picklable by replacing closures with callable classes * the initial issue using Lock objects does no longer exist as they were removed when we introduced dask for parallelism
automl · Oct 8, 2020 · 5d306bb · 5d306bb
1 parent 49b3750
commit 5d306bb
Show file tree

Hide file tree

Showing 2 changed files with 50 additions and 19 deletions.
diff --git a/autosklearn/experimental/askl2.py b/autosklearn/experimental/askl2.py
@@ -1,27 +1,43 @@
+import hashlib
 import json
 import os
+import pathlib
 import pickle
 from typing import Any, Dict, Optional, Union
 
 from ConfigSpace import Configuration
 import numpy as np
 import pandas as pd
+import sklearn
 
+import autosklearn
 from autosklearn.classification import AutoSklearnClassifier
 import autosklearn.experimental.selector
 from autosklearn.metrics import Scorer
 
-this_directory = os.path.abspath(os.path.dirname(__file__))
-selector_file = os.path.join(this_directory, 'selector.pkl')
-training_data_file = os.path.join(this_directory, 'askl2_training_data.json')
+this_directory = pathlib.Path(__file__).resolve().parent
+training_data_file = this_directory / 'askl2_training_data.json'
 with open(training_data_file) as fh:
     training_data = json.load(fh)
+    fh.seek(0)
+    m = hashlib.md5()
+    m.update(fh.read().encode('utf8'))
+training_data_hash = m.hexdigest()[:10]
+sklearn_version = sklearn.__version__
+autosklearn_version = autosklearn.__version__
+selector_file = pathlib.Path(
+    os.environ.get(
+        'XDG_CACHE_HOME',
+        '~/.cache/auto-sklearn/askl2_selector_%s_%s_%s.pkl'
+        % (autosklearn_version, sklearn_version, training_data_hash),
+    )
+).expanduser()
 metafeatures = pd.DataFrame(training_data['metafeatures'])
 y_values = np.array(training_data['y_values'])
 strategies = training_data['strategies']
 minima_for_methods = training_data['minima_for_methods']
 maxima_for_methods = training_data['maxima_for_methods']
-if not os.path.exists(selector_file):
+if not selector_file.exists():
     selector = autosklearn.experimental.selector.OneVSOneSelector(
         configuration=training_data['configuration'],
         default_strategy_idx=strategies.index('RF_SH-eta4-i_holdout_iterative_es_if'),
@@ -34,12 +50,17 @@
         minima=minima_for_methods,
         maxima=maxima_for_methods,
     )
+    selector_file.parent.mkdir(exist_ok=True, parents=True)
     with open(selector_file, 'wb') as fh:
         pickle.dump(selector, fh)
 
 
-def get_smac_object_callback(portfolio):
-    def get_smac_object(
+class SmacObjectCallback:
+    def __init__(self, portfolio):
+        self.portfolio = portfolio
+
+    def __call__(
+        self,
         scenario_dict,
         seed,
         ta,
@@ -56,7 +77,7 @@ def get_smac_object(
 
         initial_configurations = [
             Configuration(configuration_space=scenario.cs, values=member)
-            for member in portfolio.values()]
+            for member in self.portfolio.values()]
 
         rh2EPM = RunHistory2EPM4LogCost
         return SMAC4AC(
@@ -70,11 +91,17 @@ def get_smac_object(
             n_jobs=n_jobs,
             dask_client=dask_client,
         )
-    return get_smac_object
 
 
-def get_sh_object_callback(budget_type, eta, initial_budget, portfolio):
-    def get_smac_object(
+class SHObjectCallback:
+    def __init__(self, budget_type, eta, initial_budget, portfolio):
+        self.budget_type = budget_type
+        self.eta = eta
+        self.initial_budget = initial_budget
+        self.portfolio = portfolio
+
+    def __call__(
+        self,
         scenario_dict,
         seed,
         ta,
@@ -91,10 +118,10 @@ def get_smac_object(
         scenario = Scenario(scenario_dict)
         initial_configurations = [
             Configuration(configuration_space=scenario.cs, values=member)
-            for member in portfolio.values()]
+            for member in self.portfolio.values()]
 
         rh2EPM = RunHistory2EPM4LogCost
-        ta_kwargs['budget_type'] = budget_type
+        ta_kwargs['budget_type'] = self.budget_type
 
         smac4ac = SMAC4AC(
             scenario=scenario,
@@ -106,9 +133,9 @@ def get_smac_object(
             run_id=seed,
             intensifier=SuccessiveHalving,
             intensifier_kwargs={
-                'initial_budget': initial_budget,
+                'initial_budget': self.initial_budget,
                 'max_budget': 100,
-                'eta': eta,
+                'eta': self.eta,
                 'min_chall': 1,
             },
             dask_client=dask_client,
@@ -118,7 +145,6 @@ def get_smac_object(
             len(scenario.cs.get_hyperparameters()) / 2
         )
         return smac4ac
-    return get_smac_object
 
 
 class AutoSklearn2Classifier(AutoSklearnClassifier):
@@ -236,15 +262,15 @@ def fit(self, X, y,
         else:
             resampling_strategy_kwargs = None
 
-        portfolio_file = os.path.join(this_directory, 'askl2_portfolios', '%s.json' % automl_policy)
+        portfolio_file = this_directory / 'askl2_portfolios' / ('%s.json' % automl_policy)
         with open(portfolio_file) as fh:
             portfolio_json = json.load(fh)
         portfolio = portfolio_json['portfolio']
 
         if setting['fidelity'] == 'SH':
-            smac_callback = get_sh_object_callback('iterations', 4, 5.0, portfolio)
+            smac_callback = SHObjectCallback('iterations', 4, 5.0, portfolio)
         else:
-            smac_callback = get_smac_object_callback(portfolio)
+            smac_callback = SmacObjectCallback(portfolio)
 
         self.resampling_strategy = resampling_strategy
         self.resampling_strategy_arguments = resampling_strategy_kwargs

diff --git a/test/test_automl/test_estimators.py b/test/test_automl/test_estimators.py
@@ -651,7 +651,7 @@ def test_regression_methods_returns_self(self):
 
 class AutoSklearn2ClassifierTest(unittest.TestCase):
     # Currently this class only tests that the methods of AutoSklearnClassifier
-    # which should return self actually return self.
+    # which should return self actually return self and can be pickled.
     def test_classification_methods_returns_self(self):
         X_train, y_train, X_test, y_test = putil.get_dataset('iris')
         automl = AutoSklearn2Classifier(time_left_for_this_task=60, ensemble_size=0,)
@@ -665,6 +665,11 @@ def test_classification_methods_returns_self(self):
         automl_refitted = automl.refit(X_train.copy(), y_train.copy())
         self.assertIs(automl, automl_refitted)
 
+        predictions = automl_fitted.predict(X_test)
+        self.assertGreaterEqual(sklearn.metrics.accuracy_score(y_test, predictions), 2 / 3)
+
+        pickle.dumps(automl_fitted)
+
 
 if __name__ == "__main__":
     unittest.main()