From 4587d6abdd83bed30df86eebdc84862773c6fb51 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Wed, 16 Sep 2020 16:38:00 +0200 Subject: [PATCH 1/2] Update regression * fix iterative regressors * fix a bug in histgradientboosting * make extra trees and random forest regressors iterative * add unit test for regression meta data construction --- .../metalearning/metafeatures/metafeatures.py | 13 +- .../components/classification/extra_trees.py | 54 ++++-- .../components/regression/extra_trees.py | 24 +-- .../regression/gradient_boosting.py | 147 ++++++++------ .../components/regression/random_forest.py | 21 +- .../components/regression/ridge_regression.py | 61 ------ .../pipeline/components/regression/sgd.py | 13 +- scripts/01_create_commands.py | 17 +- scripts/03_calculate_metafeatures.py | 22 ++- ...un_auto-sklearn_for_metadata_generation.py | 20 +- scripts/update_metadata_util.py | 14 +- test/test_automl/test_estimators.py | 21 +- .../regression/test_gradient_boosting.py | 12 +- .../regression/test_ridge_regression.py | 24 --- test/test_pipeline/test_regression.py | 4 +- test/test_scripts/test_metadata_generation.py | 182 ++++++++++++++++-- 16 files changed, 396 insertions(+), 253 deletions(-) delete mode 100644 autosklearn/pipeline/components/regression/ridge_regression.py delete mode 100644 test/test_pipeline/components/regression/test_ridge_regression.py diff --git a/autosklearn/metalearning/metafeatures/metafeatures.py b/autosklearn/metalearning/metafeatures/metafeatures.py index 51bc7cb227..d39c1c244d 100644 --- a/autosklearn/metalearning/metafeatures/metafeatures.py +++ b/autosklearn/metalearning/metafeatures/metafeatures.py @@ -8,6 +8,7 @@ # TODO use balanced accuracy! from sklearn.utils import check_array from sklearn.multiclass import OneVsRestClassifier +from sklearn.utils.multiclass import type_of_target from autosklearn.pipeline.components.data_preprocessing.data_preprocessing \ import DataPreprocessor @@ -654,7 +655,7 @@ def _calculate(self, X, y, categorical): class LandmarkLDA(MetaFeature): def _calculate(self, X, y, categorical): import sklearn.discriminant_analysis - if len(y.shape) == 1 or y.shape[1] == 1: + if type(y) in ('binary', 'multiclass'): kf = sklearn.model_selection.StratifiedKFold(n_splits=5) else: kf = sklearn.model_selection.KFold(n_splits=5) @@ -690,7 +691,7 @@ class LandmarkNaiveBayes(MetaFeature): def _calculate(self, X, y, categorical): import sklearn.naive_bayes - if len(y.shape) == 1 or y.shape[1] == 1: + if type(y) in ('binary', 'multiclass'): kf = sklearn.model_selection.StratifiedKFold(n_splits=5) else: kf = sklearn.model_selection.KFold(n_splits=5) @@ -719,7 +720,7 @@ class LandmarkDecisionTree(MetaFeature): def _calculate(self, X, y, categorical): import sklearn.tree - if len(y.shape) == 1 or y.shape[1] == 1: + if type(y) in ('binary', 'multiclass'): kf = sklearn.model_selection.StratifiedKFold(n_splits=5) else: kf = sklearn.model_selection.KFold(n_splits=5) @@ -755,7 +756,7 @@ class LandmarkDecisionNodeLearner(MetaFeature): def _calculate(self, X, y, categorical): import sklearn.tree - if len(y.shape) == 1 or y.shape[1] == 1: + if type(y) in ('binary', 'multiclass'): kf = sklearn.model_selection.StratifiedKFold(n_splits=5) else: kf = sklearn.model_selection.KFold(n_splits=5) @@ -784,7 +785,7 @@ class LandmarkRandomNodeLearner(MetaFeature): def _calculate(self, X, y, categorical): import sklearn.tree - if len(y.shape) == 1 or y.shape[1] == 1: + if type(y) in ('binary', 'multiclass'): kf = sklearn.model_selection.StratifiedKFold(n_splits=5) else: 
kf = sklearn.model_selection.KFold(n_splits=5) @@ -836,7 +837,7 @@ class Landmark1NN(MetaFeature): def _calculate(self, X, y, categorical): import sklearn.neighbors - if len(y.shape) == 1 or y.shape[1] == 1: + if type(y) in ('binary', 'multiclass'): kf = sklearn.model_selection.StratifiedKFold(n_splits=5) else: kf = sklearn.model_selection.KFold(n_splits=5) diff --git a/autosklearn/pipeline/components/classification/extra_trees.py b/autosklearn/pipeline/components/classification/extra_trees.py index 83f8e1943d..bc3eb5e4d7 100644 --- a/autosklearn/pipeline/components/classification/extra_trees.py +++ b/autosklearn/pipeline/components/classification/extra_trees.py @@ -23,30 +23,19 @@ def __init__(self, criterion, min_samples_leaf, class_weight=None): self.n_estimators = self.get_max_iter() - if criterion not in ("gini", "entropy"): - raise ValueError("'criterion' is not in ('gini', 'entropy'): " - "%s" % criterion) self.criterion = criterion - - if check_none(max_depth): - self.max_depth = None - else: - self.max_depth = int(max_depth) - if check_none(max_leaf_nodes): - self.max_leaf_nodes = None - else: - self.max_leaf_nodes = int(max_leaf_nodes) - - self.min_samples_leaf = int(min_samples_leaf) - self.min_samples_split = int(min_samples_split) - self.max_features = float(max_features) - self.bootstrap = check_for_bool(bootstrap) - self.min_weight_fraction_leaf = float(min_weight_fraction_leaf) - self.min_impurity_decrease = float(min_impurity_decrease) + self.max_depth = max_depth + self.max_leaf_nodes = max_leaf_nodes + self.min_samples_leaf = min_samples_leaf + self.min_samples_split = min_samples_split + self.max_features = max_features + self.bootstrap = bootstrap + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.min_impurity_decrease = min_impurity_decrease self.oob_score = oob_score - self.n_jobs = int(n_jobs) + self.n_jobs = n_jobs self.random_state = random_state - self.verbose = int(verbose) + self.verbose = verbose self.class_weight = class_weight self.estimator = None @@ -65,6 +54,29 @@ def iterative_fit(self, X, y, sample_weight=None, n_iter=1, refit=False): if self.estimator is None: max_features = int(X.shape[1] ** float(self.max_features)) + if self.criterion not in ("gini", "entropy"): + raise ValueError("'criterion' is not in ('gini', 'entropy'): " + "%s" % self.criterion) + + if check_none(self.max_depth): + self.max_depth = None + else: + self.max_depth = int(self.max_depth) + if check_none(self.max_leaf_nodes): + self.max_leaf_nodes = None + else: + self.max_leaf_nodes = int(self.max_leaf_nodes) + + self.min_samples_leaf = int(self.min_samples_leaf) + self.min_samples_split = int(self.min_samples_split) + self.max_features = float(self.max_features) + self.min_impurity_decrease = float(self.min_impurity_decrease) + self.min_weight_fraction_leaf = float(self.min_weight_fraction_leaf) + self.oob_score = check_for_bool(self.oob_score) + self.bootstrap = check_for_bool(self.bootstrap) + self.n_jobs = int(self.n_jobs) + self.verbose = int(self.verbose) + self.estimator = ETC(n_estimators=n_iter, criterion=self.criterion, max_depth=self.max_depth, diff --git a/autosklearn/pipeline/components/regression/extra_trees.py b/autosklearn/pipeline/components/regression/extra_trees.py index e7ff30b7f7..9b55205372 100644 --- a/autosklearn/pipeline/components/regression/extra_trees.py +++ b/autosklearn/pipeline/components/regression/extra_trees.py @@ -16,18 +16,19 @@ class ExtraTreesRegressor( ): def __init__(self, criterion, min_samples_leaf, min_samples_split, 
max_features, bootstrap, max_leaf_nodes, - max_depth, min_impurity_decrease, oob_score=False, n_jobs=1, - random_state=None, verbose=0): + max_depth, min_weight_fraction_leaf, min_impurity_decrease, + oob_score=False, n_jobs=1, random_state=None, verbose=0): self.n_estimators = self.get_max_iter() self.criterion = criterion - self.max_depth = max_depth self.max_leaf_nodes = max_leaf_nodes self.min_samples_leaf = min_samples_leaf self.min_samples_split = min_samples_split self.max_features = max_features - self.min_impurity_decrease = min_impurity_decrease self.bootstrap = bootstrap + self.max_depth = max_depth + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.min_impurity_decrease = min_impurity_decrease self.oob_score = oob_score self.n_jobs = n_jobs self.random_state = random_state @@ -38,6 +39,9 @@ def __init__(self, criterion, min_samples_leaf, def get_max_iter(): return 512 + def get_current_iter(self): + return self.estimator.n_estimators + def iterative_fit(self, X, y, n_iter=1, refit=False): from sklearn.ensemble import ExtraTreesRegressor as ETR @@ -45,7 +49,6 @@ def iterative_fit(self, X, y, n_iter=1, refit=False): self.estimator = None if self.estimator is None: - self.n_estimators = int(self.n_estimators) if self.criterion not in ("mse", "friedman_mse", "mae"): raise ValueError( @@ -66,6 +69,8 @@ def iterative_fit(self, X, y, n_iter=1, refit=False): self.min_samples_split = int(self.min_samples_split) self.max_features = float(self.max_features) self.min_impurity_decrease = float(self.min_impurity_decrease) + self.min_weight_fraction_leaf = float(self.min_weight_fraction_leaf) + self.oob_score = check_for_bool(self.oob_score) self.bootstrap = check_for_bool(self.bootstrap) self.n_jobs = int(self.n_jobs) self.verbose = int(self.verbose) @@ -78,6 +83,7 @@ def iterative_fit(self, X, y, n_iter=1, refit=False): bootstrap=self.bootstrap, max_features=self.max_features, max_leaf_nodes=self.max_leaf_nodes, + min_weight_fraction_leaf=self.min_weight_fraction_leaf, min_impurity_decrease=self.min_impurity_decrease, oob_score=self.oob_score, n_jobs=self.n_jobs, @@ -103,11 +109,6 @@ def predict(self, X): raise NotImplementedError return self.estimator.predict(X) - def predict_proba(self, X): - if self.estimator is None: - raise NotImplementedError() - return self.estimator.predict_proba(X) - @staticmethod def get_properties(dataset_properties=None): return {'shortname': 'ET', @@ -131,6 +132,7 @@ def get_hyperparameter_search_space(dataset_properties=None): "max_features", 0.1, 1.0, default_value=1) max_depth = UnParametrizedHyperparameter(name="max_depth", value="None") + min_weight_fraction_leaf = UnParametrizedHyperparameter('min_weight_fraction_leaf', 0.) 
max_leaf_nodes = UnParametrizedHyperparameter("max_leaf_nodes", "None") min_samples_split = UniformIntegerHyperparameter( @@ -146,7 +148,7 @@ def get_hyperparameter_search_space(dataset_properties=None): cs.add_hyperparameters([criterion, max_features, max_depth, max_leaf_nodes, min_samples_split, - min_samples_leaf, min_impurity_decrease, + min_samples_leaf, min_impurity_decrease, min_weight_fraction_leaf, bootstrap]) return cs diff --git a/autosklearn/pipeline/components/regression/gradient_boosting.py b/autosklearn/pipeline/components/regression/gradient_boosting.py index b2f259c6a9..a74dfd37cd 100644 --- a/autosklearn/pipeline/components/regression/gradient_boosting.py +++ b/autosklearn/pipeline/components/regression/gradient_boosting.py @@ -6,19 +6,25 @@ UnParametrizedHyperparameter from ConfigSpace.conditions import EqualsCondition, InCondition -from autosklearn.pipeline.components.base import AutoSklearnRegressionAlgorithm +from autosklearn.pipeline.components.base import ( + AutoSklearnRegressionAlgorithm, + IterativeComponent, +) from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS from autosklearn.util.common import check_none -class GradientBoosting(AutoSklearnRegressionAlgorithm): - def __init__(self, loss, learning_rate, max_iter, min_samples_leaf, max_depth, +class GradientBoosting( + IterativeComponent, + AutoSklearnRegressionAlgorithm, +): + def __init__(self, loss, learning_rate, min_samples_leaf, max_depth, max_leaf_nodes, max_bins, l2_regularization, early_stop, tol, scoring, n_iter_no_change=0, validation_fraction=None, random_state=None, verbose=0): self.loss = loss self.learning_rate = learning_rate - self.max_iter = max_iter + self.max_iter = self.get_max_iter() self.min_samples_leaf = min_samples_leaf self.max_depth = max_depth self.max_leaf_nodes = max_leaf_nodes @@ -32,64 +38,96 @@ def __init__(self, loss, learning_rate, max_iter, min_samples_leaf, max_depth, self.random_state = random_state self.verbose = verbose self.estimator = None + self.fully_fit_ = False - def fit(self, X, y): + @staticmethod + def get_max_iter(): + return 512 + + def get_current_iter(self): + return self.estimator.n_iter_ + + def iterative_fit(self, X, y, n_iter=2, refit=False): + + """ + Set n_iter=2 for the same reason as for SGD + """ import sklearn.ensemble from sklearn.experimental import enable_hist_gradient_boosting # noqa - # Special fix for gradient boosting! 
- if isinstance(X, np.ndarray): - X = np.ascontiguousarray(X, dtype=X.dtype) + if refit: + self.estimator = None - self.learning_rate = float(self.learning_rate) - self.max_iter = int(self.max_iter) - self.min_samples_leaf = int(self.min_samples_leaf) - if check_none(self.max_depth): - self.max_depth = None - else: - self.max_depth = int(self.max_depth) - if check_none(self.max_leaf_nodes): - self.max_leaf_nodes = None - else: - self.max_leaf_nodes = int(self.max_leaf_nodes) - self.max_bins = int(self.max_bins) - self.l2_regularization = float(self.l2_regularization) - self.tol = float(self.tol) - if check_none(self.scoring): - self.scoring = None - if self.early_stop == "off": - self.n_iter_no_change = 0 - self.validation_fraction = None - elif self.early_stop == "train": - self.n_iter_no_change = int(self.n_iter_no_change) - self.validation_fraction = None - elif self.early_stop == "valid": - self.n_iter_no_change = int(self.n_iter_no_change) - self.validation_fraction = float(self.validation_fraction) + if self.estimator is None: + self.fully_fit_ = False + self.learning_rate = float(self.learning_rate) + self.max_iter = int(self.max_iter) + self.min_samples_leaf = int(self.min_samples_leaf) + if check_none(self.max_depth): + self.max_depth = None + else: + self.max_depth = int(self.max_depth) + if check_none(self.max_leaf_nodes): + self.max_leaf_nodes = None + else: + self.max_leaf_nodes = int(self.max_leaf_nodes) + self.max_bins = int(self.max_bins) + self.l2_regularization = float(self.l2_regularization) + self.tol = float(self.tol) + if check_none(self.scoring): + self.scoring = None + if self.early_stop == "off": + self.n_iter_no_change = 0 + self.validation_fraction_ = None + elif self.early_stop == "train": + self.n_iter_no_change = int(self.n_iter_no_change) + self.validation_fraction_ = None + elif self.early_stop == "valid": + self.n_iter_no_change = int(self.n_iter_no_change) + self.validation_fraction_ = float(self.validation_fraction) + else: + raise ValueError("early_stop should be either off, train or valid") + self.verbose = int(self.verbose) + n_iter = int(np.ceil(n_iter)) + + self.estimator = sklearn.ensemble.HistGradientBoostingRegressor( + loss=self.loss, + learning_rate=self.learning_rate, + max_iter=n_iter, + min_samples_leaf=self.min_samples_leaf, + max_depth=self.max_depth, + max_leaf_nodes=self.max_leaf_nodes, + max_bins=self.max_bins, + l2_regularization=self.l2_regularization, + tol=self.tol, + scoring=self.scoring, + n_iter_no_change=self.n_iter_no_change, + validation_fraction=self.validation_fraction_, + verbose=self.verbose, + warm_start=True, + random_state=self.random_state, + ) else: - raise ValueError("early_stop should be either off, train or valid") - self.verbose = int(self.verbose) - - self.estimator = sklearn.ensemble.HistGradientBoostingRegressor( - loss=self.loss, - learning_rate=self.learning_rate, - max_iter=self.max_iter, - min_samples_leaf=self.min_samples_leaf, - max_depth=self.max_depth, - max_leaf_nodes=self.max_leaf_nodes, - max_bins=self.max_bins, - l2_regularization=self.l2_regularization, - tol=self.tol, - scoring=self.scoring, - n_iter_no_change=self.n_iter_no_change, - validation_fraction=self.validation_fraction, - verbose=self.verbose, - random_state=self.random_state, - ) + self.estimator.max_iter += n_iter + self.estimator.max_iter = min(self.estimator.max_iter, + self.max_iter) self.estimator.fit(X, y) + + if self.estimator.max_iter >= self.max_iter \ + or self.estimator.max_iter > self.estimator.n_iter_: + self.fully_fit_ = 
True + return self + def configuration_fully_fitted(self): + if self.estimator is None: + return False + elif not hasattr(self, 'fully_fit_'): + return False + else: + return self.fully_fit_ + def predict(self, X): if self.estimator is None: raise NotImplementedError @@ -104,7 +142,6 @@ def get_properties(dataset_properties=None): 'handles_multiclass': False, 'handles_multilabel': False, 'handles_multioutput': False, - 'prefers_data_normalized': False, 'is_deterministic': True, 'input': (DENSE, UNSIGNED_DATA), 'output': (PREDICTIONS,)} @@ -116,8 +153,6 @@ def get_hyperparameter_search_space(dataset_properties=None): "loss", ["least_squares"], default_value="least_squares") learning_rate = UniformFloatHyperparameter( name="learning_rate", lower=0.01, upper=1, default_value=0.1, log=True) - max_iter = UniformIntegerHyperparameter( - "max_iter", 32, 512, default_value=100) min_samples_leaf = UniformIntegerHyperparameter( name="min_samples_leaf", lower=1, upper=200, default_value=20, log=True) max_depth = UnParametrizedHyperparameter( @@ -138,7 +173,7 @@ def get_hyperparameter_search_space(dataset_properties=None): validation_fraction = UniformFloatHyperparameter( name="validation_fraction", lower=0.01, upper=0.4, default_value=0.1) - cs.add_hyperparameters([loss, learning_rate, max_iter, min_samples_leaf, + cs.add_hyperparameters([loss, learning_rate, min_samples_leaf, max_depth, max_leaf_nodes, max_bins, l2_regularization, early_stop, tol, scoring, n_iter_no_change, validation_fraction]) diff --git a/autosklearn/pipeline/components/regression/random_forest.py b/autosklearn/pipeline/components/regression/random_forest.py index e3482e4ea2..054c283dc5 100644 --- a/autosklearn/pipeline/components/regression/random_forest.py +++ b/autosklearn/pipeline/components/regression/random_forest.py @@ -36,6 +36,9 @@ def __init__(self, criterion, max_features, def get_max_iter(): return 512 + def get_current_iter(self): + return self.estimator.n_estimators + def iterative_fit(self, X, y, n_iter=1, refit=False): from sklearn.ensemble import RandomForestRegressor @@ -48,18 +51,23 @@ def iterative_fit(self, X, y, n_iter=1, refit=False): self.max_depth = None else: self.max_depth = int(self.max_depth) + self.min_samples_split = int(self.min_samples_split) self.min_samples_leaf = int(self.min_samples_leaf) + self.max_features = float(self.max_features) + self.bootstrap = check_for_bool(self.bootstrap) + if check_none(self.max_leaf_nodes): self.max_leaf_nodes = None else: self.max_leaf_nodes = int(self.max_leaf_nodes) + self.min_impurity_decrease = float(self.min_impurity_decrease) self.estimator = RandomForestRegressor( - n_estimators=0, + n_estimators=n_iter, criterion=self.criterion, max_features=self.max_features, max_depth=self.max_depth, @@ -72,9 +80,10 @@ def iterative_fit(self, X, y, n_iter=1, refit=False): random_state=self.random_state, n_jobs=self.n_jobs, warm_start=True) - self.estimator.n_estimators += n_iter - self.estimator.n_estimators = min(self.estimator.n_estimators, - self.n_estimators) + else: + self.estimator.n_estimators += n_iter + self.estimator.n_estimators = min(self.estimator.n_estimators, + self.n_estimators) self.estimator.fit(X, y) return self @@ -109,8 +118,12 @@ def get_hyperparameter_search_space(dataset_properties=None): cs = ConfigurationSpace() criterion = CategoricalHyperparameter("criterion", ['mse', 'friedman_mse', 'mae']) + + # In contrast to the random forest classifier we want to use more max_features + # and therefore have this not on a sqrt scale max_features = 
UniformFloatHyperparameter( "max_features", 0.1, 1.0, default_value=1.0) + max_depth = UnParametrizedHyperparameter("max_depth", "None") min_samples_split = UniformIntegerHyperparameter( "min_samples_split", 2, 20, default_value=2) diff --git a/autosklearn/pipeline/components/regression/ridge_regression.py b/autosklearn/pipeline/components/regression/ridge_regression.py deleted file mode 100644 index 3da5a435df..0000000000 --- a/autosklearn/pipeline/components/regression/ridge_regression.py +++ /dev/null @@ -1,61 +0,0 @@ -from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import UniformFloatHyperparameter, UnParametrizedHyperparameter - -from autosklearn.pipeline.components.base import AutoSklearnRegressionAlgorithm -from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, PREDICTIONS, SPARSE -from autosklearn.util.common import check_for_bool - - -class RidgeRegression(AutoSklearnRegressionAlgorithm): - def __init__(self, alpha, fit_intercept, tol, random_state=None): - self.alpha = alpha - self.fit_intercept = fit_intercept - self.tol = tol - self.random_state = random_state - self.estimator = None - - def fit(self, X, Y): - import sklearn.linear_model - - self.alpha = float(self.alpha) - self.fit_intercept = check_for_bool(self.fit_intercept) - self.tol = float(self.tol) - - self.estimator = sklearn.linear_model.Ridge(alpha=self.alpha, - fit_intercept=self.fit_intercept, - tol=self.tol, - copy_X=True, - normalize=False, - random_state=self.random_state) - self.estimator.fit(X, Y) - return self - - def predict(self, X): - if self.estimator is None: - raise NotImplementedError - return self.estimator.predict(X) - - @staticmethod - def get_properties(dataset_properties=None): - return {'shortname': 'Rigde', - 'name': 'Ridge Regression', - 'handles_regression': True, - 'handles_classification': False, - 'handles_multiclass': False, - 'handles_multilabel': False, - 'handles_multioutput': True, - 'prefers_data_normalized': True, - 'is_deterministic': True, - 'input': (SPARSE, DENSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,)} - - @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): - cs = ConfigurationSpace() - alpha = UniformFloatHyperparameter( - "alpha", 10 ** -5, 10., log=True, default_value=1.) 
- fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True") - tol = UniformFloatHyperparameter("tol", 1e-5, 1e-1, - default_value=1e-3, log=True) - cs.add_hyperparameters([alpha, fit_intercept, tol]) - return cs diff --git a/autosklearn/pipeline/components/regression/sgd.py b/autosklearn/pipeline/components/regression/sgd.py index c8b91fb931..e3bbf2b12a 100644 --- a/autosklearn/pipeline/components/regression/sgd.py +++ b/autosklearn/pipeline/components/regression/sgd.py @@ -39,6 +39,9 @@ def __init__(self, loss, penalty, alpha, fit_intercept, tol, def get_max_iter(): return 1024 + def get_current_iter(self): + return self.n_iter_ + def iterative_fit(self, X, y, n_iter=2, refit=False): from sklearn.linear_model import SGDRegressor import sklearn.preprocessing @@ -56,10 +59,9 @@ def iterative_fit(self, X, y, n_iter=2, refit=False): self.scaler = None if self.estimator is None: + self.fully_fit_ = False self.alpha = float(self.alpha) - self.fit_intercept = check_for_bool(self.fit_intercept) - self.tol = float(self.tol) self.l1_ratio = float( self.l1_ratio) if self.l1_ratio is not None else 0.15 self.epsilon = float( @@ -68,6 +70,9 @@ def iterative_fit(self, X, y, n_iter=2, refit=False): self.power_t = float( self.power_t) if self.power_t is not None else 0.25 self.average = check_for_bool(self.average) + self.fit_intercept = check_for_bool(self.fit_intercept) + self.tol = float(self.tol) + self.estimator = SGDRegressor(loss=self.loss, penalty=self.penalty, alpha=self.alpha, @@ -88,6 +93,7 @@ def iterative_fit(self, X, y, n_iter=2, refit=False): self.scaler.fit(y.reshape((-1, 1))) Y_scaled = self.scaler.transform(y.reshape((-1, 1))).ravel() self.estimator.fit(X, Y_scaled) + self.n_iter_ = self.estimator.n_iter_ else: self.estimator.max_iter += n_iter self.estimator.max_iter = min(self.estimator.max_iter, self.max_iter) @@ -104,8 +110,9 @@ def iterative_fit(self, X, y, n_iter=2, refit=False): coef_init=None, intercept_init=None ) + self.n_iter_ += self.estimator.n_iter_ - if self.estimator.max_iter >= self.max_iter or n_iter > self.estimator.n_iter_: + if self.estimator.max_iter >= self.max_iter or self.estimator.max_iter > self.n_iter_: self.fully_fit_ = True return self diff --git a/scripts/01_create_commands.py b/scripts/01_create_commands.py index 0a4536c7b5..d637c97cfe 100644 --- a/scripts/01_create_commands.py +++ b/scripts/01_create_commands.py @@ -1,4 +1,5 @@ import argparse +import itertools import os import sys @@ -8,28 +9,20 @@ parser = argparse.ArgumentParser() parser.add_argument('--working-directory', type=str, required=True) -parser.add_argument('--task-type', required=True, - choices=['classification', 'regression']) args = parser.parse_args() working_directory = args.working_directory -task_type = args.task_type -if task_type == 'classification': - tasks = classification_tasks - command_file_name = os.path.join(working_directory, 'umd-cls.txt') -else: - tasks = regression_tasks - command_file_name = os.path.join(working_directory, 'umd-rs.txt') +command_file_name = os.path.join(working_directory, 'metadata_commands.txt') this_directory = os.path.dirname(os.path.abspath(__file__)) script_name = 'run_auto-sklearn_for_metadata_generation.py' absolute_script_name = os.path.join(this_directory, script_name) commands = [] -for task_id in tasks: +for task_id in itertools.chain(classification_tasks, regression_tasks): command = ('python3 %s --working-directory %s --time-limit 86400 ' - '--per-run-time-limit 1800 --task-id %d --task-type %s -s 1' % - (absolute_script_name, 
working_directory, task_id, task_type)) + '--per-run-time-limit 1800 --task-id %d -s 1' % + (absolute_script_name, working_directory, task_id)) commands.append(command) with open(command_file_name, 'w') as fh: diff --git a/scripts/03_calculate_metafeatures.py b/scripts/03_calculate_metafeatures.py index f0f1d25712..062c746b7d 100644 --- a/scripts/03_calculate_metafeatures.py +++ b/scripts/03_calculate_metafeatures.py @@ -14,19 +14,20 @@ from autosklearn.data.abstract_data_manager import perform_one_hot_encoding from autosklearn.metalearning.metafeatures import metafeatures, metafeature +from autosklearn.smbo import EXCLUDE_META_FEATURES_CLASSIFICATION, EXCLUDE_META_FEATURES_REGRESSION sys.path.append('.') from update_metadata_util import load_task, classification_tasks, \ regression_tasks -def calculate_metafeatures(task_id): +def calculate_metafeatures(task_id, exclude): print(task_id) - X_train, y_train, X_test, y_test, cat = load_task(task_id) + X_train, y_train, X_test, y_test, cat, _ = load_task(task_id) categorical = [True if 'categorical' == c else False for c in cat] _metafeatures_labels = metafeatures.calculate_all_metafeatures_with_labels( - X_train, y_train, [False] * X_train.shape[1], task_id) + X_train, y_train, [False] * X_train.shape[1], task_id, dont_calculate=exclude) X_train, sparse = perform_one_hot_encoding(scipy.sparse.issparse(X_train), categorical, [X_train]) @@ -37,7 +38,7 @@ def calculate_metafeatures(task_id): obj = pynisher.enforce_limits(mem_in_mb=3072)( metafeatures.calculate_all_metafeatures_encoded_labels) _metafeatures_encoded_labels = obj(X_train, y_train, - categorical, task_id) + categorical, task_id, dont_calculate=exclude) end_time = time.time() if obj.exit_status == pynisher.MemorylimitException: @@ -74,7 +75,7 @@ def calculate_metafeatures(task_id): parser.add_argument("--n-jobs", help="Compute metafeatures in parallel if possible.", type=int, default=1) - parser.add_argument("--test-mode", type=bool, default=False) + parser.add_argument("--test-mode", action='store_true') args = parser.parse_args() working_directory = args.working_directory @@ -99,6 +100,9 @@ def calculate_metafeatures(task_id): if test_mode: tasks = [tasks[0]] + EXCLUDE_META_FEATURES = EXCLUDE_META_FEATURES_CLASSIFICATION \ + if task_type == 'classification' else EXCLUDE_META_FEATURES_REGRESSION + tasks = copy.deepcopy(tasks) np.random.shuffle(tasks) @@ -106,10 +110,10 @@ def producer(): for task_id in tasks: yield task_id - memory = joblib.Memory(cachedir='/tmp/joblib', verbose=10) + memory = joblib.Memory(location='/tmp/joblib', verbose=10) cached_calculate_metafeatures = memory.cache(calculate_metafeatures) mfs = joblib.Parallel(n_jobs=args.n_jobs) \ - (joblib.delayed(cached_calculate_metafeatures)(task_id) + (joblib.delayed(cached_calculate_metafeatures)(task_id, EXCLUDE_META_FEATURES) for task_id in producer()) for mf in mfs: @@ -168,6 +172,10 @@ def producer(): feature_steps = defaultdict(list) metafeature_names = list() for metafeature_name in metafeatures.metafeatures.functions: + + if metafeature_name in EXCLUDE_META_FEATURES: + continue + dependency = metafeatures.metafeatures.get_dependency(metafeature_name) if dependency is not None: feature_steps[dependency].append(metafeature_name) diff --git a/scripts/run_auto-sklearn_for_metadata_generation.py b/scripts/run_auto-sklearn_for_metadata_generation.py index 5fcd121d59..72bdee885f 100644 --- a/scripts/run_auto-sklearn_for_metadata_generation.py +++ b/scripts/run_auto-sklearn_for_metadata_generation.py @@ -22,8 +22,6 @@ 
parser.add_argument('--time-limit', type=int, required=True) parser.add_argument('--per-run-time-limit', type=int, required=True) parser.add_argument('--task-id', type=int, required=True) -parser.add_argument('--task-type', choices=['classification', 'regression'], - required=True) parser.add_argument('-s', '--seed', type=int, required=True) parser.add_argument('--unittest', action='store_true') args = parser.parse_args() @@ -32,10 +30,11 @@ time_limit = args.time_limit per_run_time_limit = args.per_run_time_limit task_id = args.task_id -task_type = args.task_type seed = args.seed is_test = args.unittest +X_train, y_train, X_test, y_test, cat, task_type = load_task(task_id) + configuration_output_dir = os.path.join(working_directory, 'configuration', task_type) try: @@ -59,15 +58,20 @@ } if is_test: - automl_arguments['include_estimators'] = ['sgd'] - automl_arguments['resampling_strategy_arguments'] = {'folds': 3} - include = {'classifier': ['sgd']} + automl_arguments['resampling_strategy_arguments'] = {'folds': 2} + if task_type == 'classification': + automl_arguments['include_estimators'] = ['sgd'] + include = {'classifier': ['sgd']} + elif task_type == 'regression': + automl_arguments['include_estimators'] = ['extra_trees'] + automl_arguments['include_preprocessors'] = ['no_preprocessing'] + include = {'regressor': ['extra_trees'], 'feature_preprocessor': ['no_preprocessing']} + else: + raise ValueError('Unsupported task type: %s' % str(task_type)) else: automl_arguments['resampling_strategy_arguments'] = {'folds': 10} include = None -X_train, y_train, X_test, y_test, cat = load_task(task_id) - if task_type == 'classification': automl_arguments['metric'] = balanced_accuracy automl = AutoSklearnClassifier(**automl_arguments) diff --git a/scripts/update_metadata_util.py b/scripts/update_metadata_util.py index 139cd3e752..285b23cc05 100644 --- a/scripts/update_metadata_util.py +++ b/scripts/update_metadata_util.py @@ -18,7 +18,7 @@ 75181, 75187, 75250, 75249, 75248, 75243, 75244, 75182] regression_tasks = [2280, 2288, 2289, 2292, 2300, 2306, 2307, 2309, 2313, 2315, 4768, 4769, 4772, 4774, 4779, 4790, 4796, 4835, - 4840, 4881, 4883, 4885, 4892, 4893, 5022, 5024, 7393] + 4881, 4883, 4885, 4892, 4893, 5022, 5024, 7393] def load_task(task_id): @@ -35,9 +35,11 @@ def load_task(task_id): del dataset cat = ['categorical' if c else 'numerical' for c in cat] - unique = np.unique(y_train) - mapping = {unique_value: i for i, unique_value in enumerate(unique)} - y_train = np.array([mapping[value] for value in y_train]) - y_test = np.array([mapping[value] for value in y_test]) + if isinstance(task, openml.tasks.OpenMLClassificationTask): + task_type = 'classification' + elif isinstance(task, openml.tasks.OpenMLRegressionTask): + task_type = 'regression' + else: + raise ValueError('Unknown task type') - return X_train, y_train, X_test, y_test, cat + return X_train, y_train, X_test, y_test, cat, task_type diff --git a/test/test_automl/test_estimators.py b/test/test_automl/test_estimators.py index c29bc81573..5ade4cc124 100644 --- a/test/test_automl/test_estimators.py +++ b/test/test_automl/test_estimators.py @@ -718,26 +718,25 @@ def test_cv_regression(self): Makes sure that when using a cv strategy, we are able to fit a regressor """ - tmp = os.path.join(self.test_dir, '..', '.tmp_regression_fit') - output = os.path.join(self.test_dir, '..', '.out_regression_fit') + tmp = os.path.join(self.test_dir, '..', '.tmp_regression_fit_cv') + output = os.path.join(self.test_dir, '..', '.out_regression_fit_cv') 
self._setUp(tmp) self._setUp(output) - X_train, Y_train, X_test, Y_test = putil.get_dataset('boston') - automl = AutoSklearnRegressor(time_left_for_this_task=30, - per_run_time_limit=5, + X_train, Y_train, X_test, Y_test = putil.get_dataset('boston', train_size_maximum=300) + automl = AutoSklearnRegressor(time_left_for_this_task=60, + per_run_time_limit=10, resampling_strategy='cv', tmp_folder=tmp, output_folder=output) automl.fit(X_train, Y_train) predictions = automl.predict(X_test) - self.assertEqual(predictions.shape, (356,)) - score = mean_squared_error(Y_test, predictions) - # On average np.sqrt(30) away from the target -> ~5.5 on average - # Results with select rates drops avg score to a range of -32.40 to -37, on 30 seconds - # constraint. With more time_left_for_this_task this is no longer an issue - self.assertGreaterEqual(score, -37) + self.assertEqual(predictions.shape, (206,)) + score = r2(Y_test, predictions) + print(Y_test) + print(predictions) + self.assertGreaterEqual(score, 0.1) self._tearDown(tmp) self._tearDown(output) diff --git a/test/test_pipeline/components/regression/test_gradient_boosting.py b/test/test_pipeline/components/regression/test_gradient_boosting.py index 26fdd73e38..9fcb2cd623 100644 --- a/test/test_pipeline/components/regression/test_gradient_boosting.py +++ b/test/test_pipeline/components/regression/test_gradient_boosting.py @@ -11,14 +11,14 @@ class GradientBoostingComponentTest(BaseRegressionComponentTest): __test__ = True res = dict() - res["default_boston"] = 0.7677915076432977 + res["default_boston"] = 0.7491382574462079 + res["default_boston_iterative"] = 0.7491382574462079 res["default_boston_sparse"] = None - res["default_diabetes"] = 0.3657740311481247 + res["boston_n_calls"] = 9 + res["default_diabetes"] = 0.2872735632261877 + res["default_diabetes_iterative"] = 0.2872735632261877 res["default_diabetes_sparse"] = None + res["diabetes_n_call"] = 11 sk_mod = sklearn.ensemble.GradientBoostingRegressor module = GradientBoosting - step_hyperparameter = { - 'name': 'n_estimators', - 'value': 100, - } diff --git a/test/test_pipeline/components/regression/test_ridge_regression.py b/test/test_pipeline/components/regression/test_ridge_regression.py deleted file mode 100644 index 49205a4edb..0000000000 --- a/test/test_pipeline/components/regression/test_ridge_regression.py +++ /dev/null @@ -1,24 +0,0 @@ -import sklearn.linear_model - -from autosklearn.pipeline.components.regression.ridge_regression import \ - RidgeRegression -from .test_base import BaseRegressionComponentTest - - -class RidgeComponentTest(BaseRegressionComponentTest): - __test__ = True - - res = dict() - res["default_boston"] = 0.7035465377559671 - res["default_boston_iterative"] = None - res["default_boston_sparse"] = 0.1163004000785135 - res["default_boston_sparse_places"] = 4 - res["default_boston_iterative_sparse"] = None - res["default_diabetes"] = 0.32614416980439365 - res["default_diabetes_iterative"] = None - res["default_diabetes_sparse"] = 0.12989753681434824 - res["default_diabetes_iterative_sparse"] = None - - sk_mod = sklearn.linear_model.Ridge - - module = RidgeRegression diff --git a/test/test_pipeline/test_regression.py b/test/test_pipeline/test_regression.py index e6450d2275..de730bc604 100644 --- a/test/test_pipeline/test_regression.py +++ b/test/test_pipeline/test_regression.py @@ -275,7 +275,7 @@ def test_get_hyperparameter_search_space(self): self.assertIsInstance(cs, ConfigurationSpace) conditions = cs.get_conditions() hyperparameters = cs.get_hyperparameters() - 
self.assertEqual(143, len(hyperparameters)) + self.assertEqual(140, len(hyperparameters)) self.assertEqual(len(hyperparameters) - 6, len(conditions)) def test_get_hyperparameter_search_space_include_exclude_models(self): @@ -338,7 +338,7 @@ def test_get_hyperparameter_search_space_only_forbidden_combinations(self): "Cannot find a legal default configuration", SimpleRegressionPipeline, include={ - 'regressor': ['ridge_regression'], + 'regressor': ['extra_trees'], 'feature_preprocessor': ['densifier'] }, dataset_properties={'sparse': True} diff --git a/test/test_scripts/test_metadata_generation.py b/test/test_scripts/test_metadata_generation.py index 40ca4014b1..66a1ba727a 100644 --- a/test/test_scripts/test_metadata_generation.py +++ b/test/test_scripts/test_metadata_generation.py @@ -16,8 +16,13 @@ def setUp(self): self.working_directory = '/tmp/autosklearn-unittest-tmp-dir-%s-%d-%d' % ( socket.gethostname(), os.getpid(), random.randint(0, 1000000)) + def print_files(self): + print('Existing files:') + for dirpath, dirnames, filenames in os.walk(self.working_directory): + print(dirpath, dirnames, filenames) + @unittest.skipIf(sys.version_info < (3, 6), 'This test requires up-to-date python') - def test_metadata_generation(self): + def test_metadata_generation_classification(self): current_directory = __file__ scripts_directory = os.path.abspath(os.path.join(current_directory, '..', '..', '..', @@ -35,15 +40,12 @@ def test_metadata_generation(self): # 3. create configuration commands script_filename = os.path.join(scripts_directory, '01_create_commands.py') - cmd = 'python3 %s --working-directory %s --task-type %s' % ( - script_filename, self.working_directory, task_type) - rval = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + cmd = 'python3 %s --working-directory %s' % (script_filename, self.working_directory) + rval = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) self.assertEqual(rval.returncode, 0, msg=str(rval)) # 4. run one of the commands to get some data - commands_output_file = os.path.join(self.working_directory, - 'umd-cls.txt') + commands_output_file = os.path.join(self.working_directory, 'metadata_commands.txt') self.assertTrue(os.path.exists(commands_output_file)) with open(commands_output_file) as fh: @@ -65,18 +67,15 @@ def test_metadata_generation(self): print('STDOUT: %s' % repr(rval.stdout), flush=True) print('STDERR: %s' % repr(rval.stderr), flush=True) - # Print the files which are there - print('Existing files:') - for dirpath, dirnames, filenames in os.walk(self.working_directory): - print(dirpath, dirnames, filenames) + self.print_files() expected_output_directory = os.path.join(self.working_directory, 'configuration', - 'classification', + task_type, '75222') self.assertTrue(os.path.exists(expected_output_directory)) smac_log = os.path.join(self.working_directory, - 'configuration/classification/75222', + 'configuration', task_type, '75222', 'AutoML(1):75222.log') with open(smac_log) as fh: smac_output = fh.read() @@ -122,8 +121,10 @@ def test_metadata_generation(self): # 6. 
Calculate metafeatures script_filename = os.path.join(scripts_directory, '03_calculate_metafeatures.py') - cmd = 'python3 %s --working-directory %s --task-type %s --test-mode ' \ - 'True' % (script_filename, self.working_directory, task_type) + cmd = ( + 'python3 %s --working-directory %s --task-type %s --test-mode ' + % (script_filename, self.working_directory, task_type) + ) rval = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) self.assertEqual(rval.returncode, 0, msg=str(rval)) @@ -169,6 +170,157 @@ def test_metadata_generation(self): self.assertLess(algorithm_runs['data'][0][3], 0.9) self.assertEqual(algorithm_runs['data'][0][4], 'ok') + @unittest.skipIf(sys.version_info < (3, 6), 'This test requires up-to-date python') + def test_metadata_generation_regression(self): + current_directory = __file__ + scripts_directory = os.path.abspath(os.path.join(current_directory, + '..', '..', '..', + 'scripts')) + + # 1. create working directory + try: + os.makedirs(self.working_directory) + except Exception as e: + print(e) + + task_type = 'regression' + + # 2. should be done by the person running the unit tests! + + # 3. create configuration commands + script_filename = os.path.join(scripts_directory, '01_create_commands.py') + cmd = 'python3 %s --working-directory %s' % (script_filename, self.working_directory) + rval = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + self.assertEqual(rval.returncode, 0, msg=str(rval)) + + # 4. run one of the commands to get some data + commands_output_file = os.path.join(self.working_directory, 'metadata_commands.txt') + self.assertTrue(os.path.exists(commands_output_file)) + + with open(commands_output_file) as fh: + while True: + cmd = fh.readline() + if 'task-id 5022' in cmd: + break + + self.assertIn('time-limit 86400', cmd) + self.assertIn('per-run-time-limit 1800', cmd) + cmd = cmd.replace('time-limit 86400', 'time-limit 60').replace( + 'per-run-time-limit 1800', 'per-run-time-limit 7') + # This tells the script to use the same memory limit for testing as + # for training. In production, it would use twice as much! + cmd = cmd.replace('-s 1', '-s 1 --unittest') + print('COMMAND: %s' % cmd) + rval = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + print('STDOUT: %s' % repr(rval.stdout), flush=True) + print('STDERR: %s' % repr(rval.stderr), flush=True) + + self.print_files() + + expected_output_directory = os.path.join(self.working_directory, + 'configuration', + task_type, + '5022') + self.assertTrue(os.path.exists(expected_output_directory)) + smac_log = os.path.join(self.working_directory, + 'configuration', task_type, '5022', + 'AutoML(1):5022.log') + with open(smac_log) as fh: + smac_output = fh.read() + self.assertEqual(rval.returncode, 0, msg=str(rval) + '\n' + smac_output) + expected_validation_output = os.path.join(expected_output_directory, + 'smac3-output', + 'run_1', + 'validation_trajectory.json') + self.assertTrue(os.path.exists(expected_validation_output)) + trajectory = os.path.join(expected_output_directory, + 'smac3-output', 'run_1', 'trajectory.json') + + with open(expected_validation_output) as fh_validation: + with open(trajectory) as fh_trajectory: + traj = json.load(fh_trajectory) + valid_traj = json.load(fh_validation) + print('Validation trajectory:') + print(valid_traj) + self.assertGreater(len(traj), 0) + self.assertEqual(len(traj), len(valid_traj)) + + # 5. 
Get the test performance of these configurations + script_filename = os.path.join(scripts_directory, '02_retrieve_metadata.py') + cmd = 'python3 %s --working-directory %s --task-type %s' % ( + script_filename, self.working_directory, task_type) + print('COMMAND: %s' % cmd) + rval = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + print('STDOUT: %s' % repr(rval.stdout), flush=True) + print('STDERR: %s' % repr(rval.stderr), flush=True) + self.assertEqual(rval.returncode, 0, msg=str(rval)) + + self.print_files() + + for file in ['algorithm_runs.arff', 'configurations.csv', + 'description.results.txt']: + for metric in ['r2', 'mean_squared_error']: + path = os.path.join( + self.working_directory, + 'configuration_results', + '%s_regression_dense' % metric, + file, + ) + self.assertTrue(os.path.exists(path), msg=path) + + # 6. Calculate metafeatures + script_filename = os.path.join(scripts_directory, '03_calculate_metafeatures.py') + cmd = ( + 'python3 %s --working-directory %s --task-type %s --test-mode ' + % (script_filename, self.working_directory, task_type) + ) + rval = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + self.assertEqual(rval.returncode, 0, msg=str(rval)) + for file in ['calculation_times.csv', 'description.features.txt', + 'feature_costs.arff', 'feature_runstatus.arff', + 'feature_values.arff']: + self.assertTrue(os.path.exists(os.path.join(self.working_directory, + 'metafeatures', + file))) + + # 7. Create aslib files + script_filename = os.path.join(scripts_directory, '04_create_aslib_files.py') + cmd = 'python3 %s --working-directory %s --task-type %s ' % ( + script_filename, self.working_directory, task_type) + rval = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + self.assertEqual(rval.returncode, 0, msg=str(rval)) + + for file in ['algorithm_runs.arff', 'configurations.csv', + 'description.txt', 'feature_costs.arff', + 'feature_runstatus.arff', 'feature_values.arff', + 'readme.txt']: + self.assertTrue(os.path.exists(os.path.join(self.working_directory, + 'metadata', + 'r2_%s_dense' % task_type, + file))) + + with open(os.path.join(self.working_directory, + 'metadata', + 'r2_regression_dense', + 'algorithm_runs.arff')) as fh: + algorithm_runs = arff.load(fh) + self.assertEqual(algorithm_runs['attributes'], + [('instance_id', 'STRING'), + ('repetition', 'NUMERIC'), + ('algorithm', 'STRING'), + ('r2', 'NUMERIC'), + ('runstatus', + ['ok', 'timeout', 'memout', 'not_applicable', + 'crash', 'other'])]) + self.assertEqual(len(algorithm_runs['data']), 1) + self.assertEqual(len(algorithm_runs['data'][0]), 5) + self.assertLess(algorithm_runs['data'][0][3], 0.9) + self.assertEqual(algorithm_runs['data'][0][4], 'ok') + def tearDown(self): for i in range(5): try: From 8537d78e20264ad2a95eb2e7cf19d64519ccb7bc Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Wed, 16 Sep 2020 17:22:16 +0200 Subject: [PATCH 2/2] pep8, simplification, unit tests --- .../metalearning/metafeatures/metafeatures.py | 1 - .../regression/gradient_boosting.py | 6 +- scripts/02_retrieve_metadata.py | 97 ++--- scripts/03_calculate_metafeatures.py | 341 +++++++++--------- scripts/04_create_aslib_files.py | 232 ++++++------ test/test_pipeline/test_regression.py | 7 + test/test_scripts/test_metadata_generation.py | 22 +- 7 files changed, 355 insertions(+), 351 deletions(-) diff --git a/autosklearn/metalearning/metafeatures/metafeatures.py b/autosklearn/metalearning/metafeatures/metafeatures.py index 
d39c1c244d..7f49fe4998 100644 --- a/autosklearn/metalearning/metafeatures/metafeatures.py +++ b/autosklearn/metalearning/metafeatures/metafeatures.py @@ -8,7 +8,6 @@ # TODO use balanced accuracy! from sklearn.utils import check_array from sklearn.multiclass import OneVsRestClassifier -from sklearn.utils.multiclass import type_of_target from autosklearn.pipeline.components.data_preprocessing.data_preprocessing \ import DataPreprocessor diff --git a/autosklearn/pipeline/components/regression/gradient_boosting.py b/autosklearn/pipeline/components/regression/gradient_boosting.py index a74dfd37cd..bc22df3eb2 100644 --- a/autosklearn/pipeline/components/regression/gradient_boosting.py +++ b/autosklearn/pipeline/components/regression/gradient_boosting.py @@ -114,8 +114,10 @@ def iterative_fit(self, X, y, n_iter=2, refit=False): self.estimator.fit(X, y) - if self.estimator.max_iter >= self.max_iter \ - or self.estimator.max_iter > self.estimator.n_iter_: + if ( + self.estimator.max_iter >= self.max_iter + or self.estimator.max_iter > self.estimator.n_iter_ + ): self.fully_fit_ = True return self diff --git a/scripts/02_retrieve_metadata.py b/scripts/02_retrieve_metadata.py index 766172b70a..a3dde751b1 100644 --- a/scripts/02_retrieve_metadata.py +++ b/scripts/02_retrieve_metadata.py @@ -27,7 +27,10 @@ def retrieve_matadata(validation_directory, metric, configuration_space, configurations = dict() configurations_to_ids = dict() - possible_experiment_directories = os.listdir(validation_directory) + try: + possible_experiment_directories = os.listdir(validation_directory) + except FileNotFoundError: + return {}, {} for ped in possible_experiment_directories: task_name = None @@ -154,63 +157,61 @@ def main(): parser = ArgumentParser() parser.add_argument("--working-directory", type=str, required=True) - parser.add_argument("--task-type", required=True, - choices=['classification', 'regression']) parser.add_argument("--cutoff", type=int, default=-1) parser.add_argument("--only-best", type=bool, default=True) args = parser.parse_args() working_directory = args.working_directory - task_type = args.task_type cutoff = args.cutoff only_best = args.only_best - if task_type == 'classification': - metadata_sets = itertools.product( - [0, 1], [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION], - CLASSIFICATION_METRICS) - input_directory = os.path.join(working_directory, 'configuration', - 'classification') - elif task_type == 'regression': - metadata_sets = itertools.product( - [0, 1], [REGRESSION], REGRESSION_METRICS) - input_directory = os.path.join(working_directory, 'configuration', - 'regression') - else: - raise ValueError(task_type) - - output_dir = os.path.join(working_directory, 'configuration_results') - - for sparse, task, metric in metadata_sets: - print(TASK_TYPES_TO_STRING[task], metric, sparse) - - output_dir_ = os.path.join(output_dir, '%s_%s_%s' % ( - metric, TASK_TYPES_TO_STRING[task], - 'sparse' if sparse else 'dense')) - - configuration_space = pipeline.get_configuration_space( - {'is_sparse': sparse, 'task': task}) - - outputs, configurations = retrieve_matadata( - validation_directory=input_directory, - metric=metric, - cutoff=cutoff, - configuration_space=configuration_space, - only_best=only_best) - - if len(outputs) == 0: - print("No output found for %s, %s, %s" % - (metric, TASK_TYPES_TO_STRING[task], - 'sparse' if sparse else 'dense')) - continue + for task_type in ('classification', 'regression'): + if task_type == 'classification': + metadata_sets = itertools.product( + [0, 1], 
[BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION], + CLASSIFICATION_METRICS) + input_directory = os.path.join(working_directory, 'configuration', + 'classification') + elif task_type == 'regression': + metadata_sets = itertools.product( + [0, 1], [REGRESSION], REGRESSION_METRICS) + input_directory = os.path.join(working_directory, 'configuration', + 'regression') + else: + raise ValueError(task_type) + + output_dir = os.path.join(working_directory, 'configuration_results') + + for sparse, task, metric in metadata_sets: + print(TASK_TYPES_TO_STRING[task], metric, sparse) + + output_dir_ = os.path.join(output_dir, '%s_%s_%s' % ( + metric, TASK_TYPES_TO_STRING[task], + 'sparse' if sparse else 'dense')) + + configuration_space = pipeline.get_configuration_space( + {'is_sparse': sparse, 'task': task}) + + outputs, configurations = retrieve_matadata( + validation_directory=input_directory, + metric=metric, + cutoff=cutoff, + configuration_space=configuration_space, + only_best=only_best) + + if len(outputs) == 0: + print("No output found for %s, %s, %s" % + (metric, TASK_TYPES_TO_STRING[task], + 'sparse' if sparse else 'dense')) + continue - try: - os.makedirs(output_dir_) - except: - pass + try: + os.makedirs(output_dir_) + except: + pass - write_output(outputs, configurations, output_dir_, - configuration_space, metric) + write_output(outputs, configurations, output_dir_, + configuration_space, metric) if __name__ == "__main__": diff --git a/scripts/03_calculate_metafeatures.py b/scripts/03_calculate_metafeatures.py index 062c746b7d..150c4e2d7d 100644 --- a/scripts/03_calculate_metafeatures.py +++ b/scripts/03_calculate_metafeatures.py @@ -69,8 +69,6 @@ def calculate_metafeatures(task_id, exclude): if __name__ == "__main__": parser = ArgumentParser() parser.add_argument("--working-directory", type=str, required=True) - parser.add_argument("--task-type", required=True, - choices=['classification', 'regression']) parser.add_argument("--memory-limit", type=int, default=3072) parser.add_argument("--n-jobs", help="Compute metafeatures in parallel if possible.", @@ -79,7 +77,6 @@ def calculate_metafeatures(task_id, exclude): args = parser.parse_args() working_directory = args.working_directory - task_type = args.task_type memory_limit = args.memory_limit n_jobs = args.n_jobs test_mode = args.test_mode @@ -92,172 +89,174 @@ def calculate_metafeatures(task_id, exclude): all_metafeatures = {} - if task_type == 'classification': - tasks = classification_tasks - else: - tasks = regression_tasks - - if test_mode: - tasks = [tasks[0]] - - EXCLUDE_META_FEATURES = EXCLUDE_META_FEATURES_CLASSIFICATION \ - if task_type == 'classification' else EXCLUDE_META_FEATURES_REGRESSION - - tasks = copy.deepcopy(tasks) - np.random.shuffle(tasks) - - def producer(): - for task_id in tasks: - yield task_id - - memory = joblib.Memory(location='/tmp/joblib', verbose=10) - cached_calculate_metafeatures = memory.cache(calculate_metafeatures) - mfs = joblib.Parallel(n_jobs=args.n_jobs) \ - (joblib.delayed(cached_calculate_metafeatures)(task_id, EXCLUDE_META_FEATURES) - for task_id in producer()) - - for mf in mfs: - if mf is not None: - all_metafeatures[mf.dataset_name] = mf - - # Write the calculation times as a csv file to disc (can be viewed in - # LibreOffice calc afterwards) - calculation_times = defaultdict(dict) - metafeature_values = defaultdict(dict) - helperfunction_values = defaultdict(dict) - - for i, task_id in enumerate(all_metafeatures): - calculation_times[task_id] = dict() - for metafeature_name in sorted( - 
all_metafeatures[task_id].metafeature_values): - metafeature_value = all_metafeatures[task_id].metafeature_values[ - metafeature_name] - calculation_times[task_id][metafeature_name] = \ - metafeature_value.time - if metafeature_value.type_ == "HELPERFUNCTION": - helperfunction_values[task_id][metafeature_name] = \ - metafeature_value.value - else: - metafeature_values[task_id][metafeature_name] = \ - metafeature_value.value - - calculation_times = pd.DataFrame(calculation_times).transpose() - with open(os.path.join(output_directory, "calculation_times.csv"), - "w") as fh: - fh.write(calculation_times.to_csv()) - - # Write all metafeatures in the aslib1.0 format - metafeature_values = pd.DataFrame(metafeature_values).transpose() - arff_object = dict() - arff_object['attributes'] = [('instance_id', 'STRING'), - ('repetition', 'NUMERIC')] + \ - [('%s' % name, 'NUMERIC') for name in - metafeature_values.columns] - arff_object['relation'] = "FEATURE_VALUES" - arff_object['description'] = "" - - data = [] - for idx in metafeature_values.index: - line = [idx, 1] - line += [value if np.isfinite(value) else None - for value in metafeature_values.loc[idx, :].values] - data.append(line) - arff_object['data'] = data - - with open(os.path.join(output_directory, "feature_values.arff"), - "w") as fh: - arff.dump(arff_object, fh) - - # Feature steps and runtimes according to the aslib1.0 format - feature_steps = defaultdict(list) - metafeature_names = list() - for metafeature_name in metafeatures.metafeatures.functions: - - if metafeature_name in EXCLUDE_META_FEATURES: - continue - - dependency = metafeatures.metafeatures.get_dependency(metafeature_name) - if dependency is not None: - feature_steps[dependency].append(metafeature_name) - feature_steps[metafeature_name].append(metafeature_name) - - metafeature_names.append(metafeature_name) - - # Write the feature runstatus in the aslib1.0 format - arff_object = dict() - arff_object['attributes'] = [('instance_id', 'STRING'), - ('repetition', 'NUMERIC')] + \ - [('%s' % name, - ['ok', 'timeout', 'memout', 'presolved', - 'crash', 'other']) - for name in feature_steps] - arff_object['relation'] = "FEATURE_RUNSTATUS" - arff_object['description'] = "" - - data = [] - for idx in metafeature_values.index: - line = [idx, 1] - for feature_step in feature_steps: - if feature_step in helperfunction_values[idx]: - line.append('ok' if helperfunction_values[feature_step] is not \ - None else 'other') - elif feature_step in metafeature_values.loc[idx]: - line.append('ok' if np.isfinite(metafeature_values.loc[idx][ - feature_step]) else 'other') - else: - line.append('other') - - data.append(line) - arff_object['data'] = data - - with open(os.path.join(output_directory, "feature_runstatus.arff"), - "w") as fh: - arff.dump(arff_object, fh) - - arff_object = dict() - arff_object['attributes'] = [('instance_id', 'STRING'), - ('repetition', 'NUMERIC')] + \ - [('%s' % feature_step, 'NUMERIC') for - feature_step in feature_steps] - arff_object['relation'] = "FEATURE_COSTS" - arff_object['description'] = "" - - data = [] - for instance_id in calculation_times.index: - calculation_times_per_group = dict() - line = [instance_id, 1] + for task_type in ('classification', 'regression'): + + if task_type == 'classification': + tasks = classification_tasks + else: + tasks = regression_tasks + + if test_mode: + tasks = [tasks[0]] + + EXCLUDE_META_FEATURES = EXCLUDE_META_FEATURES_CLASSIFICATION \ + if task_type == 'classification' else EXCLUDE_META_FEATURES_REGRESSION + + tasks = 
copy.deepcopy(tasks) + np.random.shuffle(tasks) + + def producer(): + for task_id in tasks: + yield task_id + + memory = joblib.Memory(location='/tmp/joblib', verbose=10) + cached_calculate_metafeatures = memory.cache(calculate_metafeatures) + mfs = joblib.Parallel(n_jobs=args.n_jobs) \ + (joblib.delayed(cached_calculate_metafeatures)(task_id, EXCLUDE_META_FEATURES) + for task_id in producer()) + + for mf in mfs: + if mf is not None: + all_metafeatures[mf.dataset_name] = mf + + # Write the calculation times as a csv file to disc (can be viewed in + # LibreOffice calc afterwards) + calculation_times = defaultdict(dict) + metafeature_values = defaultdict(dict) + helperfunction_values = defaultdict(dict) + + for i, task_id in enumerate(all_metafeatures): + calculation_times[task_id] = dict() + for metafeature_name in sorted( + all_metafeatures[task_id].metafeature_values): + metafeature_value = all_metafeatures[task_id].metafeature_values[ + metafeature_name] + calculation_times[task_id][metafeature_name] = \ + metafeature_value.time + if metafeature_value.type_ == "HELPERFUNCTION": + helperfunction_values[task_id][metafeature_name] = \ + metafeature_value.value + else: + metafeature_values[task_id][metafeature_name] = \ + metafeature_value.value + + calculation_times = pd.DataFrame(calculation_times).transpose() + with open(os.path.join(output_directory, "calculation_times.csv"), + "w") as fh: + fh.write(calculation_times.to_csv()) + + # Write all metafeatures in the aslib1.0 format + metafeature_values = pd.DataFrame(metafeature_values).transpose() + arff_object = dict() + arff_object['attributes'] = [('instance_id', 'STRING'), + ('repetition', 'NUMERIC')] + \ + [('%s' % name, 'NUMERIC') for name in + metafeature_values.columns] + arff_object['relation'] = "FEATURE_VALUES" + arff_object['description'] = "" + + data = [] + for idx in metafeature_values.index: + line = [idx, 1] + line += [value if np.isfinite(value) else None + for value in metafeature_values.loc[idx, :].values] + data.append(line) + arff_object['data'] = data + + with open(os.path.join(output_directory, "feature_values.arff"), + "w") as fh: + arff.dump(arff_object, fh) + + # Feature steps and runtimes according to the aslib1.0 format + feature_steps = defaultdict(list) + metafeature_names = list() + for metafeature_name in metafeatures.metafeatures.functions: + + if metafeature_name in EXCLUDE_META_FEATURES: + continue + + dependency = metafeatures.metafeatures.get_dependency(metafeature_name) + if dependency is not None: + feature_steps[dependency].append(metafeature_name) + feature_steps[metafeature_name].append(metafeature_name) + + metafeature_names.append(metafeature_name) + + # Write the feature runstatus in the aslib1.0 format + arff_object = dict() + arff_object['attributes'] = [('instance_id', 'STRING'), + ('repetition', 'NUMERIC')] + \ + [('%s' % name, + ['ok', 'timeout', 'memout', 'presolved', + 'crash', 'other']) + for name in feature_steps] + arff_object['relation'] = "FEATURE_RUNSTATUS" + arff_object['description'] = "" + + data = [] + for idx in metafeature_values.index: + line = [idx, 1] + for feature_step in feature_steps: + if feature_step in helperfunction_values[idx]: + line.append('ok' if helperfunction_values[feature_step] is not \ + None else 'other') + elif feature_step in metafeature_values.loc[idx]: + line.append('ok' if np.isfinite(metafeature_values.loc[idx][ + feature_step]) else 'other') + else: + line.append('other') + + data.append(line) + arff_object['data'] = data + + with 
+                  "w") as fh:
+            arff.dump(arff_object, fh)
+
+        arff_object = dict()
+        arff_object['attributes'] = [('instance_id', 'STRING'),
+                                     ('repetition', 'NUMERIC')] + \
+                                    [('%s' % feature_step, 'NUMERIC') for
+                                     feature_step in feature_steps]
+        arff_object['relation'] = "FEATURE_COSTS"
+        arff_object['description'] = ""
+
+        data = []
+        for instance_id in calculation_times.index:
+            calculation_times_per_group = dict()
+            line = [instance_id, 1]
+            for feature_step in feature_steps:
+                time_ = 0.0
+                for feature in feature_steps[feature_step]:
+                    time_ += calculation_times[feature][instance_id]
+                if not np.isfinite(time_):
+                    raise ValueError("Feature cost %s for instance %s and feature "
+                                     "step %s not finite" % (time_, instance_id, feature))
+                line.append(time_)
+            data.append(line)
+        arff_object['data'] = data
+
+        with open(os.path.join(output_directory, "feature_costs.arff"),
+                  "w") as fh:
+            arff.dump(arff_object, fh)
+
+        # Write the features part of the description.txt to a file
+        description = OrderedDict()
+        description['features_cutoff_time'] = '3600'
+        description['features_cutoff_memory'] = args.memory_limit
+        description['number_of_feature_steps'] = str(len(feature_steps))
+
         for feature_step in feature_steps:
-            time_ = 0.0
-            for feature in feature_steps[feature_step]:
-                time_ += calculation_times[feature][instance_id]
-            if not np.isfinite(time_):
-                raise ValueError("Feature cost %s for instance %s and feature "
-                                 "step %s not finite" % (time_, instance_id, feature))
-            line.append(time_)
-        data.append(line)
-    arff_object['data'] = data
-
-    with open(os.path.join(output_directory, "feature_costs.arff"),
-              "w") as fh:
-        arff.dump(arff_object, fh)
-
-    # Write the features part of the description.txt to a file
-    description = OrderedDict()
-    description['features_cutoff_time'] = '3600'
-    description['features_cutoff_memory'] = args.memory_limit
-    description['number_of_feature_steps'] = str(len(feature_steps))
-
-    for feature_step in feature_steps:
-        description['feature_step %s' % feature_step] = \
-            ", ".join(feature_steps[feature_step])
-    description['features_deterministic'] = ", ".join([
-        metafeature_name for
-        metafeature_name in
-        metafeature_names])
-    description['features_stochastic'] = ''
-    description['default_steps'] = ", ".join(feature_steps)
-
-    with open(os.path.join(output_directory,
-                           "description.features.txt"), "w") as fh:
-        for task_id in description:
-            fh.write("%s: %s\n" % (task_id, description[task_id]))
+            description['feature_step %s' % feature_step] = \
+                ", ".join(feature_steps[feature_step])
+        description['features_deterministic'] = ", ".join([
+            metafeature_name for
+            metafeature_name in
+            metafeature_names])
+        description['features_stochastic'] = ''
+        description['default_steps'] = ", ".join(feature_steps)
+
+        with open(os.path.join(output_directory,
+                               "description.features.txt"), "w") as fh:
+            for task_id in description:
+                fh.write("%s: %s\n" % (task_id, description[task_id]))
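For reviewers unfamiliar with the caching setup used in 03_calculate_metafeatures.py above: joblib.Memory memoises the per-task metafeature computation on disk, so a re-run after a crash only recomputes tasks that are not cached yet, and joblib.Parallel fans the cached calls out over --n-jobs workers. A minimal, self-contained sketch of that pattern; the expensive_computation function, the cache location and the task ids below are made up for illustration, only the joblib calls mirror the script:

    import joblib

    def expensive_computation(task_id):
        # stand-in for calculate_metafeatures(task_id, exclude)
        return task_id ** 2

    # cache results on disk, keyed by the function's arguments
    memory = joblib.Memory(location='/tmp/joblib_demo', verbose=0)
    cached_computation = memory.cache(expensive_computation)

    # parallel map over the cached function; already-cached calls are read from disk
    results = joblib.Parallel(n_jobs=2)(
        joblib.delayed(cached_computation)(task_id) for task_id in [1, 2, 3]
    )
    print(results)  # [1, 4, 9]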
parser.add_argument("--algorithm_cutoff_time", type=int, default=1800) parser.add_argument("--algorithm_cutoff_memory", type=int, default=3072) args = parser.parse_args() working_directory = args.working_directory - task_type = args.task_type output_dir = os.path.join(working_directory, 'metadata') results_dir = os.path.join(working_directory, 'configuration_results') @@ -34,117 +31,118 @@ except (OSError, IOError): pass - if task_type == 'classification': - metadata_sets = itertools.product( - [0, 1], [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION], - CLASSIFICATION_METRICS) - input_directory = os.path.join(working_directory, 'configuration', - 'classification') - elif task_type == 'regression': - metadata_sets = itertools.product( - [0, 1], [REGRESSION], REGRESSION_METRICS) - input_directory = os.path.join(working_directory, 'configuration', - 'regression') - else: - raise ValueError(task_type) - - for sparse, task, metric in metadata_sets: - print(TASK_TYPES_TO_STRING[task], metric, sparse) - - dir_name = '%s_%s_%s' % (metric, TASK_TYPES_TO_STRING[task], - 'sparse' if sparse else 'dense') - output_dir_ = os.path.join(output_dir, dir_name) - results_dir_ = os.path.join(results_dir, dir_name) - - if not os.path.exists(results_dir_): - print("Results directory %s does not exist!" % results_dir_) - continue - - try: - os.makedirs(output_dir_) - except Exception: - pass - - # Create a readme.txt - with open(os.path.join(output_dir_, "readme.txt"), "w") as fh: - pass - - # Create description.txt - with open(os.path.join(metafeatures_dir, - "description.features.txt")) as fh: - description_metafeatures = fh.read() - - with open(os.path.join(results_dir_, - "description.results.txt")) as fh: - description_results = fh.read() - - description = [description_metafeatures, description_results] - description.append("scenario_id: %s" % scenario_id) - description.append("maximize: false") - description.append( - "algorithm_cutoff_time: %d" % algorithm_cutoff_time) - description.append( - "algorithm_cutoff_memory: %d" % algorithm_cutoff_memory) - - with open(os.path.join(output_dir_, "description.txt"), "w") as fh: - for line in description: - fh.write(line) - fh.write("\n") - - # Copy feature values and add instance id - with open(os.path.join(metafeatures_dir, - "feature_values.arff")) as fh: - feature_values = arff.load(fh) - - feature_values['relation'] = scenario_id + "_" + feature_values[ - 'relation'] - - with open(os.path.join(output_dir_, "feature_values.arff"), - "w") as fh: - arff.dump(feature_values, fh) - - # Copy feature runstatus and add instance id - with open(os.path.join(metafeatures_dir, - "feature_runstatus.arff")) as fh: - feature_runstatus = arff.load(fh) - - feature_runstatus['relation'] = scenario_id + "_" + \ - feature_runstatus['relation'] - - with open(os.path.join(output_dir_, "feature_runstatus.arff"), "w") \ - as fh: - arff.dump(feature_runstatus, fh) - - # Copy feature runstatus and add instance id - with open( - os.path.join(metafeatures_dir, "feature_costs.arff")) as fh: - feature_costs = arff.load(fh) - - feature_costs['relation'] = scenario_id + "_" + feature_costs[ - 'relation'] - for i in range(len(feature_costs['data'])): - for j in range(2, len(feature_costs['data'][i])): - feature_costs['data'][i][j] = \ - round(feature_costs['data'][i][j], 5) - - with open(os.path.join(output_dir_, "feature_costs.arff"), "w") \ - as fh: - arff.dump(feature_costs, fh) - - # Copy algorithm runs and add instance id - with open(os.path.join(results_dir_, "algorithm_runs.arff")) 
-            algorithm_runs = arff.load(fh)
-
-        algorithm_runs['relation'] = scenario_id + "_" + algorithm_runs[
-            'relation']
-
-        with open(os.path.join(output_dir_, "algorithm_runs.arff"), "w") \
-                as fh:
-            arff.dump(algorithm_runs, fh)
-
-        # Copy configurations file
-        with open(os.path.join(results_dir_, "configurations.csv")) as fh:
-            algorithm_runs = fh.read()
-        with open(os.path.join(output_dir_, "configurations.csv"), "w") \
-                as fh:
-            fh.write(algorithm_runs)
+    for task_type in ('classification', 'regression'):
+        if task_type == 'classification':
+            metadata_sets = itertools.product(
+                [0, 1], [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION],
+                CLASSIFICATION_METRICS)
+            input_directory = os.path.join(working_directory, 'configuration',
+                                           'classification')
+        elif task_type == 'regression':
+            metadata_sets = itertools.product(
+                [0, 1], [REGRESSION], REGRESSION_METRICS)
+            input_directory = os.path.join(working_directory, 'configuration',
+                                           'regression')
+        else:
+            raise ValueError(task_type)
+
+        for sparse, task, metric in metadata_sets:
+            print(TASK_TYPES_TO_STRING[task], metric, sparse)
+
+            dir_name = '%s_%s_%s' % (metric, TASK_TYPES_TO_STRING[task],
+                                     'sparse' if sparse else 'dense')
+            output_dir_ = os.path.join(output_dir, dir_name)
+            results_dir_ = os.path.join(results_dir, dir_name)
+
+            if not os.path.exists(results_dir_):
+                print("Results directory %s does not exist!" % results_dir_)
+                continue
+
+            try:
+                os.makedirs(output_dir_)
+            except Exception:
+                pass
+
+            # Create a readme.txt
+            with open(os.path.join(output_dir_, "readme.txt"), "w") as fh:
+                pass
+
+            # Create description.txt
+            with open(os.path.join(metafeatures_dir,
+                                   "description.features.txt")) as fh:
+                description_metafeatures = fh.read()
+
+            with open(os.path.join(results_dir_,
+                                   "description.results.txt")) as fh:
+                description_results = fh.read()
+
+            description = [description_metafeatures, description_results]
+            description.append("scenario_id: %s" % scenario_id)
+            description.append("maximize: false")
+            description.append(
+                "algorithm_cutoff_time: %d" % algorithm_cutoff_time)
+            description.append(
+                "algorithm_cutoff_memory: %d" % algorithm_cutoff_memory)
+
+            with open(os.path.join(output_dir_, "description.txt"), "w") as fh:
+                for line in description:
+                    fh.write(line)
+                    fh.write("\n")
+
+            # Copy feature values and add instance id
+            with open(os.path.join(metafeatures_dir,
+                                   "feature_values.arff")) as fh:
+                feature_values = arff.load(fh)
+
+            feature_values['relation'] = scenario_id + "_" + feature_values[
+                'relation']
+
+            with open(os.path.join(output_dir_, "feature_values.arff"),
+                      "w") as fh:
+                arff.dump(feature_values, fh)
+
+            # Copy feature runstatus and add instance id
+            with open(os.path.join(metafeatures_dir,
+                                   "feature_runstatus.arff")) as fh:
+                feature_runstatus = arff.load(fh)
+
+            feature_runstatus['relation'] = scenario_id + "_" + \
+                feature_runstatus['relation']
+
+            with open(os.path.join(output_dir_, "feature_runstatus.arff"), "w") \
+                    as fh:
+                arff.dump(feature_runstatus, fh)
+
+            # Copy feature costs and add instance id
+            with open(
+                    os.path.join(metafeatures_dir, "feature_costs.arff")) as fh:
+                feature_costs = arff.load(fh)
+
+            feature_costs['relation'] = scenario_id + "_" + feature_costs[
+                'relation']
+            for i in range(len(feature_costs['data'])):
+                for j in range(2, len(feature_costs['data'][i])):
+                    feature_costs['data'][i][j] = \
+                        round(feature_costs['data'][i][j], 5)
+
+            with open(os.path.join(output_dir_, "feature_costs.arff"), "w") \
+                    as fh:
+                arff.dump(feature_costs, fh)
+
+            # Copy algorithm runs and add instance id
+            with open(os.path.join(results_dir_, "algorithm_runs.arff")) as fh:
+                algorithm_runs = arff.load(fh)
+
+            algorithm_runs['relation'] = scenario_id + "_" + algorithm_runs[
+                'relation']
+
+            with open(os.path.join(output_dir_, "algorithm_runs.arff"), "w") \
+                    as fh:
+                arff.dump(algorithm_runs, fh)
+
+            # Copy configurations file
+            with open(os.path.join(results_dir_, "configurations.csv")) as fh:
+                algorithm_runs = fh.read()
+            with open(os.path.join(output_dir_, "configurations.csv"), "w") \
+                    as fh:
+                fh.write(algorithm_runs)
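The copy loop in 04_create_aslib_files.py above round-trips each ASLib file through liac-arff and prefixes its relation name with the scenario id before writing it into the per-metric output directory. A short sketch of that round-trip under assumed file names (a feature_values.arff in the current directory and the 'auto-sklearn' scenario id, which is the default of the --scenario_id argument; the real script resolves paths from --working-directory):

    import arff  # liac-arff, as used by the scripts

    scenario_id = 'auto-sklearn'

    # liac-arff represents an ARFF file as a dict with 'relation',
    # 'attributes', 'description' and 'data' keys
    with open('feature_values.arff') as fh:
        feature_values = arff.load(fh)

    # prefix the relation with the scenario id, as the script does
    feature_values['relation'] = scenario_id + '_' + feature_values['relation']

    # write the renamed copy to the scenario's output directory
    with open('renamed_feature_values.arff', 'w') as fh:
        arff.dump(feature_values, fh)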
diff --git a/test/test_pipeline/test_regression.py b/test/test_pipeline/test_regression.py
index de730bc604..6d5a65d7e4 100644
--- a/test/test_pipeline/test_regression.py
+++ b/test/test_pipeline/test_regression.py
@@ -234,6 +234,13 @@ def _test_configurations(self, configurations_space, make_sparse=False,
                     print(config)
                     traceback.print_tb(sys.exc_info()[2])
                     raise e
+            except UnboundLocalError as e:
+                if "local variable 'raw_predictions_val' referenced before assignment" in e.args[0]:
+                    continue
+                else:
+                    print(traceback.format_exc())
+                    print(config)
+                    raise e
             except Exception as e:
                 if "Multiple input features cannot have the same target value" in e.args[0]:
                     continue
diff --git a/test/test_scripts/test_metadata_generation.py b/test/test_scripts/test_metadata_generation.py
index 66a1ba727a..78ca221e16 100644
--- a/test/test_scripts/test_metadata_generation.py
+++ b/test/test_scripts/test_metadata_generation.py
@@ -99,8 +99,7 @@ def test_metadata_generation_classification(self):
         # 5. Get the test performance of these configurations
         script_filename = os.path.join(scripts_directory, '02_retrieve_metadata.py')
-        cmd = 'python3 %s --working-directory %s --task-type %s' % (
-            script_filename, self.working_directory, task_type)
+        cmd = 'python3 %s --working-directory %s ' % (script_filename, self.working_directory)
         print('COMMAND: %s' % cmd)
         rval = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
@@ -122,8 +121,8 @@ def test_metadata_generation_classification(self):
         # 6. Calculate metafeatures
         script_filename = os.path.join(scripts_directory, '03_calculate_metafeatures.py')
         cmd = (
-            'python3 %s --working-directory %s --task-type %s --test-mode '
-            % (script_filename, self.working_directory, task_type)
+            'python3 %s --working-directory %s --test-mode '
+            % (script_filename, self.working_directory)
         )
         rval = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
@@ -137,8 +136,8 @@ def test_metadata_generation_classification(self):
         # 7. Create aslib files
         script_filename = os.path.join(scripts_directory, '04_create_aslib_files.py')
-        cmd = 'python3 %s --working-directory %s --task-type %s ' % (
-            script_filename, self.working_directory, task_type)
+        cmd = 'python3 %s --working-directory %s ' % (
+            script_filename, self.working_directory)
         rval = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
         self.assertEqual(rval.returncode, 0, msg=str(rval))
@@ -248,8 +247,7 @@ def test_metadata_generation_regression(self):
         # 5. Get the test performance of these configurations
         script_filename = os.path.join(scripts_directory, '02_retrieve_metadata.py')
-        cmd = 'python3 %s --working-directory %s --task-type %s' % (
-            script_filename, self.working_directory, task_type)
+        cmd = 'python3 %s --working-directory %s ' % (script_filename, self.working_directory)
         print('COMMAND: %s' % cmd)
         rval = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
@@ -273,8 +271,8 @@ def test_metadata_generation_regression(self):
         # 6. Calculate metafeatures
         script_filename = os.path.join(scripts_directory, '03_calculate_metafeatures.py')
         cmd = (
-            'python3 %s --working-directory %s --task-type %s --test-mode '
-            % (script_filename, self.working_directory, task_type)
+            'python3 %s --working-directory %s --test-mode '
+            % (script_filename, self.working_directory)
         )
         rval = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
@@ -288,8 +286,8 @@ def test_metadata_generation_regression(self):
         # 7. Create aslib files
         script_filename = os.path.join(scripts_directory, '04_create_aslib_files.py')
-        cmd = 'python3 %s --working-directory %s --task-type %s ' % (
-            script_filename, self.working_directory, task_type)
+        cmd = 'python3 %s --working-directory %s ' % (
+            script_filename, self.working_directory)
         rval = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
         self.assertEqual(rval.returncode, 0, msg=str(rval))