From 50735638a3d74289d3f5bfdccea033101d729cf6 Mon Sep 17 00:00:00 2001 From: Vlad Skripniuk Date: Mon, 3 Jun 2019 22:23:37 +0200 Subject: [PATCH 1/8] Add SelectRates for regression [fix: #375] --- .../feature_preprocessing/select_rates.py | 56 ++++++++++++++----- autosklearn/pipeline/util.py | 19 +++++-- .../test_select_rates.py | 40 +++++++++++++ 3 files changed, 98 insertions(+), 17 deletions(-) diff --git a/autosklearn/pipeline/components/feature_preprocessing/select_rates.py b/autosklearn/pipeline/components/feature_preprocessing/select_rates.py index 7406244ea9..14cdaf47e0 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/select_rates.py +++ b/autosklearn/pipeline/components/feature_preprocessing/select_rates.py @@ -9,19 +9,29 @@ class SelectRates(AutoSklearnPreprocessingAlgorithm): def __init__(self, alpha, mode='fpr', - score_func="chi2", random_state=None): + score_func="chi2", task="classification", random_state=None): import sklearn.feature_selection self.random_state = random_state # We don't use this self.alpha = alpha + self.task = task - if score_func == "chi2": + if score_func == "chi2" and task == "classification": self.score_func = sklearn.feature_selection.chi2 - elif score_func == "f_classif": + elif score_func == "f_classif" and task == "classification": self.score_func = sklearn.feature_selection.f_classif + elif score_func == "mutual_info_classif" and task == "classification": + self.score_func = sklearn.feature_selection.mutual_info_classif + elif score_func == "f_regression" and task == "regression": + self.score_func = sklearn.feature_selection.f_regression + elif score_func == "mutual_info_regression" and task == "regression": + self.score_func = sklearn.feature_selection.mutual_info_regression else: - raise ValueError("score_func must be in ('chi2, 'f_classif', 'mutual_info'), " - "but is: %s" % score_func) + raise ValueError("score_func must be in ('chi2, 'f_classif', 'mutual_info_classif') " + "for task='classification', " + "or in ('f_regression, 'mutual_info_regression') " + "for task='regression', " + "but is: %s for task='%s'" % (score_func, task)) self.mode = mode @@ -83,11 +93,19 @@ def get_properties(dataset_properties=None): if signed is not None: data_type = SIGNED_DATA if signed is True else UNSIGNED_DATA + if dataset_properties is not None and \ + 'target_type' in dataset_properties and \ + dataset_properties['target_type'] == 'regression': + + task = 'regression' + else: + task = 'classification' + return {'shortname': 'SR', 'name': 'Univariate Feature Selection based on rates', - 'handles_regression': False, - 'handles_classification': True, - 'handles_multiclass': True, + 'handles_regression': (task == 'regression'), + 'handles_classification': (task == 'classification'), + 'handles_multiclass': (task == 'classification'), 'handles_multilabel': False, 'handles_multioutput': False, 'is_deterministic': True, @@ -99,13 +117,25 @@ def get_hyperparameter_search_space(dataset_properties=None): alpha = UniformFloatHyperparameter( name="alpha", lower=0.01, upper=0.5, default_value=0.1) - score_func = CategoricalHyperparameter( - name="score_func", - choices=["chi2", "f_classif"], - default_value="chi2") + if dataset_properties is not None and \ + 'target_type' in dataset_properties and \ + dataset_properties['target_type'] == 'regression': + + score_func = Constant( + name="score_func", value="f_regression") + else: + score_func = CategoricalHyperparameter( + name="score_func", + choices=["chi2", "f_classif"], + default_value="chi2") + if dataset_properties is not None: # Chi2 can handle sparse data, so we respect this - if 'sparse' in dataset_properties and dataset_properties['sparse']: + if 'sparse' in dataset_properties and \ + dataset_properties['sparse'] and \ + ('target_type' not in dataset_properties or \ + dataset_properties['target_type'] == 'classification'): + score_func = Constant( name="score_func", value="chi2") diff --git a/autosklearn/pipeline/util.py b/autosklearn/pipeline/util.py index a2dfaa1a6c..e2197c423b 100644 --- a/autosklearn/pipeline/util.py +++ b/autosklearn/pipeline/util.py @@ -168,18 +168,29 @@ def _test_classifier_predict_proba(classifier, dataset='iris', sparse=False, return predictions, Y_test -def _test_preprocessing(Preprocessor, dataset='iris', make_sparse=False, +def _test_preprocessing(Preprocessor, dataset='iris', make_sparse=False, task=None, train_size_maximum=150): X_train, Y_train, X_test, Y_test = get_dataset(dataset=dataset, make_sparse=make_sparse, train_size_maximum=train_size_maximum) original_X_train = X_train.copy() - configuration_space = Preprocessor.get_hyperparameter_search_space() + + if task is not None: + dataset_properties = {'target_type': task} + else: + dataset_properties = None + + configuration_space = Preprocessor.get_hyperparameter_search_space(dataset_properties) default = configuration_space.get_default_configuration() + kwargs = {hp_name: default[hp_name] for hp_name in + default if default[hp_name] is not None} + + if task is not None: + kwargs['task'] = task + preprocessor = Preprocessor(random_state=np.random.RandomState(1), - **{hp_name: default[hp_name] for hp_name in - default if default[hp_name] is not None}) + **kwargs) transformer = preprocessor.fit(X_train, Y_train) return transformer.transform(X_train), original_X_train diff --git a/test/test_pipeline/components/feature_preprocessing/test_select_rates.py b/test/test_pipeline/components/feature_preprocessing/test_select_rates.py index 16e586e7e2..aa284e189c 100644 --- a/test/test_pipeline/components/feature_preprocessing/test_select_rates.py +++ b/test/test_pipeline/components/feature_preprocessing/test_select_rates.py @@ -95,3 +95,43 @@ def test_preprocessing_dtype(self): preprocessor.fit(X_train, Y_train) Xt = preprocessor.transform(X_train) self.assertEqual(Xt.dtype, np.float64) + + + def test_default_configuration_regression(self): + transformation, original = \ + _test_preprocessing(SelectRates, task='regression') + self.assertEqual(transformation.shape[0], original.shape[0]) + self.assertEqual(transformation.shape[1], 4) + self.assertFalse((transformation == 0).all()) + + + def test_preprocessing_dtype_regression(self): + # Dense + # np.float32 + X_train, Y_train, X_test, Y_test = get_dataset("iris") + self.assertEqual(X_train.dtype, np.float32) + + dataset_properties = {'target_type': 'regression'} + + configuration_space = \ + SelectRates.get_hyperparameter_search_space(dataset_properties) + default = configuration_space.get_default_configuration() + preprocessor = SelectRates(random_state=1, task='regression', + **{hp_name: default[hp_name] for hp_name in + default}) + preprocessor.fit(X_train, Y_train) + Xt = preprocessor.transform(X_train) + self.assertEqual(Xt.dtype, np.float32) + + # np.float64 + X_train, Y_train, X_test, Y_test = get_dataset("iris") + X_train = X_train.astype(np.float64) + configuration_space = \ + SelectRates.get_hyperparameter_search_space(dataset_properties) + default = configuration_space.get_default_configuration() + preprocessor = SelectRates(random_state=1, task='regression', + **{hp_name: default[hp_name] for hp_name in + default}) + preprocessor.fit(X_train, Y_train) + Xt = preprocessor.transform(X_train) + self.assertEqual(Xt.dtype, np.float64) From 045fdd0f62bad69e54b918fa9f78a18586a32c81 Mon Sep 17 00:00:00 2001 From: chico Date: Thu, 13 Aug 2020 20:08:40 +0200 Subject: [PATCH 2/8] Updated via rebase and moved to no-task approach --- .../feature_preprocessing/select_rates.py | 106 ++++++++++-------- autosklearn/pipeline/util.py | 9 +- .../feature_preprocessing/test_choice.py | 2 +- .../test_select_rates.py | 30 ++--- test/test_pipeline/test_classification.py | 13 +++ test/test_pipeline/test_regression.py | 16 ++- 6 files changed, 111 insertions(+), 65 deletions(-) diff --git a/autosklearn/pipeline/components/feature_preprocessing/select_rates.py b/autosklearn/pipeline/components/feature_preprocessing/select_rates.py index 14cdaf47e0..2f53e5d7b7 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/select_rates.py +++ b/autosklearn/pipeline/components/feature_preprocessing/select_rates.py @@ -1,6 +1,6 @@ from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ - CategoricalHyperparameter, Constant + CategoricalHyperparameter from autosklearn.pipeline.components.base import \ AutoSklearnPreprocessingAlgorithm @@ -8,32 +8,38 @@ class SelectRates(AutoSklearnPreprocessingAlgorithm): - def __init__(self, alpha, mode='fpr', - score_func="chi2", task="classification", random_state=None): + def __init__(self, alpha, mode='percentile', + score_func="chi2", random_state=None): import sklearn.feature_selection self.random_state = random_state # We don't use this self.alpha = alpha - self.task = task + self.mode = mode - if score_func == "chi2" and task == "classification": + if score_func == "chi2": self.score_func = sklearn.feature_selection.chi2 - elif score_func == "f_classif" and task == "classification": + elif score_func == "f_classif": self.score_func = sklearn.feature_selection.f_classif - elif score_func == "mutual_info_classif" and task == "classification": + elif score_func == "mutual_info_classif": self.score_func = sklearn.feature_selection.mutual_info_classif - elif score_func == "f_regression" and task == "regression": + # Work Around as SMAC does not handle Not Equal + # Mutual info needs scikit learn default to prevent + # running into p_values problem (no pvalue found) + self.mode = 'percentile' + elif score_func == "f_regression": self.score_func = sklearn.feature_selection.f_regression - elif score_func == "mutual_info_regression" and task == "regression": + elif score_func == "mutual_info_regression": self.score_func = sklearn.feature_selection.mutual_info_regression + # Work Around as SMAC does not handle Not Equal + # Mutual info needs scikit learn default to prevent + # running into p_values problem (no pvalue found) + self.mode = 'percentile' else: raise ValueError("score_func must be in ('chi2, 'f_classif', 'mutual_info_classif') " - "for task='classification', " + "for classification " "or in ('f_regression, 'mutual_info_regression') " - "for task='regression', " - "but is: %s for task='%s'" % (score_func, task)) - - self.mode = mode + "for task=regression " + "but is: %s " % (score_func)) def fit(self, X, y): import scipy.sparse @@ -93,19 +99,14 @@ def get_properties(dataset_properties=None): if signed is not None: data_type = SIGNED_DATA if signed is True else UNSIGNED_DATA - if dataset_properties is not None and \ - 'target_type' in dataset_properties and \ - dataset_properties['target_type'] == 'regression': - - task = 'regression' - else: - task = 'classification' - + # This component handles regression and classification. + # It does so by building a hyperparameter search space according + # to the dataset properties return {'shortname': 'SR', 'name': 'Univariate Feature Selection based on rates', - 'handles_regression': (task == 'regression'), - 'handles_classification': (task == 'classification'), - 'handles_multiclass': (task == 'classification'), + 'handles_regression': True, + 'handles_classification': True, + 'handles_multiclass': True, 'handles_multilabel': False, 'handles_multioutput': False, 'is_deterministic': True, @@ -117,27 +118,32 @@ def get_hyperparameter_search_space(dataset_properties=None): alpha = UniformFloatHyperparameter( name="alpha", lower=0.01, upper=0.5, default_value=0.1) - if dataset_properties is not None and \ - 'target_type' in dataset_properties and \ - dataset_properties['target_type'] == 'regression': - - score_func = Constant( - name="score_func", value="f_regression") - else: - score_func = CategoricalHyperparameter( - name="score_func", - choices=["chi2", "f_classif"], - default_value="chi2") - + target_type = 'classification' if dataset_properties is not None: - # Chi2 can handle sparse data, so we respect this - if 'sparse' in dataset_properties and \ - dataset_properties['sparse'] and \ - ('target_type' not in dataset_properties or \ - dataset_properties['target_type'] == 'classification'): + # Whether or not this component supports regression, depends + # on the dataset properties. If the dataset properties is for a regression + # task (target_type==regression) we can build the configuration space + # accordingly + if dataset_properties.get('target_type') is not None: + target_type = dataset_properties.get('target_type') + + if target_type == 'regression': + if dataset_properties is not None and 'sparse' in dataset_properties \ + and dataset_properties['sparse']: + choices = ['mutual_info_regression'] + else: + choices = ['f_regression', 'mutual_info_regression'] + else: + if dataset_properties is not None and 'sparse' in dataset_properties \ + and dataset_properties['sparse']: + choices = ['chi2', 'mutual_info_classif'] + else: + choices = ['chi2', 'f_classif', 'mutual_info_classif'] - score_func = Constant( - name="score_func", value="chi2") + score_func = CategoricalHyperparameter( + name="score_func", + choices=choices, + default_value="chi2" if 'chi2' in choices else choices[0]) mode = CategoricalHyperparameter('mode', ['fpr', 'fdr', 'fwe'], 'fpr') @@ -146,4 +152,16 @@ def get_hyperparameter_search_space(dataset_properties=None): cs.add_hyperparameter(score_func) cs.add_hyperparameter(mode) + # In case of mutual info regression, the mode needs to be percentile + # Which is the scikit learn default, else we run into p_values problem + # SMAC Cannot handle OR, so leave this code here for the future. + # Right now, we will have mode in the config space when we + # have mutual_info, yet it is not needed + # if 'mutual_info_regression' in choices: + # cond = NotEqualsCondition(mode, score_func, 'mutual_info_regression') + # cs.add_condition(cond) + # if 'mutual_info_classif' in choices: + # cond = NotEqualsCondition(mode, score_func, 'mutual_info_classif') + # cs.add_condition(cond) + return cs diff --git a/autosklearn/pipeline/util.py b/autosklearn/pipeline/util.py index e2197c423b..6e1f3aaf2d 100644 --- a/autosklearn/pipeline/util.py +++ b/autosklearn/pipeline/util.py @@ -168,8 +168,8 @@ def _test_classifier_predict_proba(classifier, dataset='iris', sparse=False, return predictions, Y_test -def _test_preprocessing(Preprocessor, dataset='iris', make_sparse=False, task=None, - train_size_maximum=150): +def _test_preprocessing(Preprocessor, dataset='iris', make_sparse=False, + train_size_maximum=150, task=None): X_train, Y_train, X_test, Y_test = get_dataset(dataset=dataset, make_sparse=make_sparse, train_size_maximum=train_size_maximum) @@ -184,10 +184,7 @@ def _test_preprocessing(Preprocessor, dataset='iris', make_sparse=False, task=No default = configuration_space.get_default_configuration() kwargs = {hp_name: default[hp_name] for hp_name in - default if default[hp_name] is not None} - - if task is not None: - kwargs['task'] = task + default if default[hp_name] is not None} preprocessor = Preprocessor(random_state=np.random.RandomState(1), **kwargs) diff --git a/test/test_pipeline/components/feature_preprocessing/test_choice.py b/test/test_pipeline/components/feature_preprocessing/test_choice.py index 838cb5c3d8..525ec38356 100644 --- a/test/test_pipeline/components/feature_preprocessing/test_choice.py +++ b/test/test_pipeline/components/feature_preprocessing/test_choice.py @@ -7,7 +7,7 @@ class FeatureProcessingTest(unittest.TestCase): def test_get_available_components(self): # Target type for target_type, num_values in [('classification', 15), - ('regression', 13)]: + ('regression', 14)]: data_properties = {'target_type': target_type} available_components = fp.FeaturePreprocessorChoice(data_properties)\ diff --git a/test/test_pipeline/components/feature_preprocessing/test_select_rates.py b/test/test_pipeline/components/feature_preprocessing/test_select_rates.py index aa284e189c..204445385c 100644 --- a/test/test_pipeline/components/feature_preprocessing/test_select_rates.py +++ b/test/test_pipeline/components/feature_preprocessing/test_select_rates.py @@ -96,27 +96,30 @@ def test_preprocessing_dtype(self): Xt = preprocessor.transform(X_train) self.assertEqual(Xt.dtype, np.float64) - def test_default_configuration_regression(self): - transformation, original = \ - _test_preprocessing(SelectRates, task='regression') + transformation, original = _test_preprocessing( + SelectRates, + dataset='boston', + task='regression', + ) self.assertEqual(transformation.shape[0], original.shape[0]) - self.assertEqual(transformation.shape[1], 4) + # From 13 to 12 features + self.assertEqual(transformation.shape[1], 12) self.assertFalse((transformation == 0).all()) - def test_preprocessing_dtype_regression(self): # Dense # np.float32 - X_train, Y_train, X_test, Y_test = get_dataset("iris") + X_train, Y_train, X_test, Y_test = get_dataset("boston") self.assertEqual(X_train.dtype, np.float32) dataset_properties = {'target_type': 'regression'} - configuration_space = \ - SelectRates.get_hyperparameter_search_space(dataset_properties) + configuration_space = SelectRates.get_hyperparameter_search_space( + dataset_properties + ) default = configuration_space.get_default_configuration() - preprocessor = SelectRates(random_state=1, task='regression', + preprocessor = SelectRates(random_state=1, **{hp_name: default[hp_name] for hp_name in default}) preprocessor.fit(X_train, Y_train) @@ -124,12 +127,13 @@ def test_preprocessing_dtype_regression(self): self.assertEqual(Xt.dtype, np.float32) # np.float64 - X_train, Y_train, X_test, Y_test = get_dataset("iris") + X_train, Y_train, X_test, Y_test = get_dataset("boston") X_train = X_train.astype(np.float64) - configuration_space = \ - SelectRates.get_hyperparameter_search_space(dataset_properties) + configuration_space = SelectRates.get_hyperparameter_search_space( + dataset_properties + ) default = configuration_space.get_default_configuration() - preprocessor = SelectRates(random_state=1, task='regression', + preprocessor = SelectRates(random_state=1, **{hp_name: default[hp_name] for hp_name in default}) preprocessor.fit(X_train, Y_train) diff --git a/test/test_pipeline/test_classification.py b/test/test_pipeline/test_classification.py index 212971f0c9..dbf07563af 100644 --- a/test/test_pipeline/test_classification.py +++ b/test/test_pipeline/test_classification.py @@ -654,6 +654,19 @@ def test_pipeline_clonability(self): param2 = params_set[name] self.assertEqual(param1, param2) + def test_select_rates_for_classification(self): + """Makes sure that the configuration space of select rates + does not include regression components""" + cs = SimpleClassificationPipeline().get_hyperparameter_search_space() + # This check only makes sense if select rates is a valid choice + self.assertIn('select_rates', + cs.get_hyperparameter('feature_preprocessor:__choice__').choices) + choices = cs.get_hyperparameter('feature_preprocessor:select_rates:score_func').choices + + # Below classification choices should not be valid in regression + self.assertNotIn('f_regression', choices) + self.assertNotIn('mutual_info_regression', choices) + def test_set_params(self): pass diff --git a/test/test_pipeline/test_regression.py b/test/test_pipeline/test_regression.py index d8b753680f..315edcf62d 100644 --- a/test/test_pipeline/test_regression.py +++ b/test/test_pipeline/test_regression.py @@ -275,7 +275,7 @@ def test_get_hyperparameter_search_space(self): self.assertIsInstance(cs, ConfigurationSpace) conditions = cs.get_conditions() hyperparameters = cs.get_hyperparameters() - self.assertEqual(140, len(hyperparameters)) + self.assertEqual(143, len(hyperparameters)) self.assertEqual(len(hyperparameters) - 6, len(conditions)) def test_get_hyperparameter_search_space_include_exclude_models(self): @@ -452,6 +452,20 @@ def test_pipeline_clonability(self): param2 = params_set[name] self.assertEqual(param1, param2) + def test_select_rates_for_regression(self): + """Makes sure that the configuration space of select rates + does not include classification components""" + cs = SimpleRegressionPipeline().get_hyperparameter_search_space() + # This check only makes sense if select rates is a valid choice + self.assertIn('select_rates', + cs.get_hyperparameter('feature_preprocessor:__choice__').choices) + choices = cs.get_hyperparameter('feature_preprocessor:select_rates:score_func').choices + + # Below classification choices should not be valid in regression + self.assertNotIn('f_classif', choices) + self.assertNotIn('mutual_info_classif', choices) + self.assertNotIn('chi2', choices) + def test_set_params(self): pass From c35275aba0dc2459198710e6187cf44de4e5318d Mon Sep 17 00:00:00 2001 From: chico Date: Sat, 22 Aug 2020 01:28:17 +0200 Subject: [PATCH 3/8] Separate regression/classification --- ...ates.py => select_rates_classification.py} | 55 +------ .../select_rates_regression.py | 126 ++++++++++++++++ .../test_meta_base_data/configurations.csv | 30 ++-- .../test_select_rates.py | 141 ------------------ .../test_select_rates_classification.py | 97 ++++++++++++ .../test_select_rates_regression.py | 86 +++++++++++ test/test_pipeline/test_classification.py | 5 +- test/test_pipeline/test_regression.py | 5 +- 8 files changed, 337 insertions(+), 208 deletions(-) rename autosklearn/pipeline/components/feature_preprocessing/{select_rates.py => select_rates_classification.py} (61%) create mode 100644 autosklearn/pipeline/components/feature_preprocessing/select_rates_regression.py delete mode 100644 test/test_pipeline/components/feature_preprocessing/test_select_rates.py create mode 100644 test/test_pipeline/components/feature_preprocessing/test_select_rates_classification.py create mode 100644 test/test_pipeline/components/feature_preprocessing/test_select_rates_regression.py diff --git a/autosklearn/pipeline/components/feature_preprocessing/select_rates.py b/autosklearn/pipeline/components/feature_preprocessing/select_rates_classification.py similarity index 61% rename from autosklearn/pipeline/components/feature_preprocessing/select_rates.py rename to autosklearn/pipeline/components/feature_preprocessing/select_rates_classification.py index 2f53e5d7b7..2d8f11619a 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/select_rates.py +++ b/autosklearn/pipeline/components/feature_preprocessing/select_rates_classification.py @@ -7,8 +7,8 @@ from autosklearn.pipeline.constants import SIGNED_DATA, UNSIGNED_DATA, SPARSE, DENSE, INPUT -class SelectRates(AutoSklearnPreprocessingAlgorithm): - def __init__(self, alpha, mode='percentile', +class SelectClassificationRates(AutoSklearnPreprocessingAlgorithm): + def __init__(self, alpha, mode='fpr', score_func="chi2", random_state=None): import sklearn.feature_selection @@ -26,19 +26,9 @@ def __init__(self, alpha, mode='percentile', # Mutual info needs scikit learn default to prevent # running into p_values problem (no pvalue found) self.mode = 'percentile' - elif score_func == "f_regression": - self.score_func = sklearn.feature_selection.f_regression - elif score_func == "mutual_info_regression": - self.score_func = sklearn.feature_selection.mutual_info_regression - # Work Around as SMAC does not handle Not Equal - # Mutual info needs scikit learn default to prevent - # running into p_values problem (no pvalue found) - self.mode = 'percentile' else: raise ValueError("score_func must be in ('chi2, 'f_classif', 'mutual_info_classif') " "for classification " - "or in ('f_regression, 'mutual_info_regression') " - "for task=regression " "but is: %s " % (score_func)) def fit(self, X, y): @@ -99,12 +89,9 @@ def get_properties(dataset_properties=None): if signed is not None: data_type = SIGNED_DATA if signed is True else UNSIGNED_DATA - # This component handles regression and classification. - # It does so by building a hyperparameter search space according - # to the dataset properties return {'shortname': 'SR', 'name': 'Univariate Feature Selection based on rates', - 'handles_regression': True, + 'handles_regression': False, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': False, @@ -118,27 +105,11 @@ def get_hyperparameter_search_space(dataset_properties=None): alpha = UniformFloatHyperparameter( name="alpha", lower=0.01, upper=0.5, default_value=0.1) - target_type = 'classification' - if dataset_properties is not None: - # Whether or not this component supports regression, depends - # on the dataset properties. If the dataset properties is for a regression - # task (target_type==regression) we can build the configuration space - # accordingly - if dataset_properties.get('target_type') is not None: - target_type = dataset_properties.get('target_type') - - if target_type == 'regression': - if dataset_properties is not None and 'sparse' in dataset_properties \ - and dataset_properties['sparse']: - choices = ['mutual_info_regression'] - else: - choices = ['f_regression', 'mutual_info_regression'] + if dataset_properties is not None and 'sparse' in dataset_properties \ + and dataset_properties['sparse']: + choices = ['chi2', 'mutual_info_classif'] else: - if dataset_properties is not None and 'sparse' in dataset_properties \ - and dataset_properties['sparse']: - choices = ['chi2', 'mutual_info_classif'] - else: - choices = ['chi2', 'f_classif', 'mutual_info_classif'] + choices = ['chi2', 'f_classif', 'mutual_info_classif'] score_func = CategoricalHyperparameter( name="score_func", @@ -152,16 +123,4 @@ def get_hyperparameter_search_space(dataset_properties=None): cs.add_hyperparameter(score_func) cs.add_hyperparameter(mode) - # In case of mutual info regression, the mode needs to be percentile - # Which is the scikit learn default, else we run into p_values problem - # SMAC Cannot handle OR, so leave this code here for the future. - # Right now, we will have mode in the config space when we - # have mutual_info, yet it is not needed - # if 'mutual_info_regression' in choices: - # cond = NotEqualsCondition(mode, score_func, 'mutual_info_regression') - # cs.add_condition(cond) - # if 'mutual_info_classif' in choices: - # cond = NotEqualsCondition(mode, score_func, 'mutual_info_classif') - # cs.add_condition(cond) - return cs diff --git a/autosklearn/pipeline/components/feature_preprocessing/select_rates_regression.py b/autosklearn/pipeline/components/feature_preprocessing/select_rates_regression.py new file mode 100644 index 0000000000..16d6bada58 --- /dev/null +++ b/autosklearn/pipeline/components/feature_preprocessing/select_rates_regression.py @@ -0,0 +1,126 @@ +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ + CategoricalHyperparameter + +from autosklearn.pipeline.components.base import \ + AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.constants import SIGNED_DATA, UNSIGNED_DATA, SPARSE, DENSE, INPUT + + +class SelectRegressionRates(AutoSklearnPreprocessingAlgorithm): + def __init__(self, alpha, mode='percentile', + score_func="f_regression", random_state=None): + import sklearn.feature_selection + + self.random_state = random_state # We don't use this + self.alpha = alpha + self.mode = mode + + if score_func == "f_regression": + self.score_func = sklearn.feature_selection.f_regression + elif score_func == "mutual_info_regression": + self.score_func = sklearn.feature_selection.mutual_info_regression + # Work Around as SMAC does not handle Not Equal + # Mutual info needs scikit learn default to prevent + # running into p_values problem (no pvalue found) + self.mode = 'percentile' + else: + raise ValueError("score_func must be in ('f_regression, 'mutual_info_regression') " + "for task=regression " + "but is: %s " % (score_func)) + + def fit(self, X, y): + import scipy.sparse + import sklearn.feature_selection + + self.alpha = float(self.alpha) + + self.preprocessor = sklearn.feature_selection.GenericUnivariateSelect( + score_func=self.score_func, param=self.alpha, mode=self.mode) + + # Because the pipeline guarantees that each feature is positive, + # clip all values below zero to zero + if self.score_func == sklearn.feature_selection.chi2: + if scipy.sparse.issparse(X): + X.data[X.data < 0] = 0.0 + else: + X[X < 0] = 0.0 + + self.preprocessor.fit(X, y) + return self + + def transform(self, X): + + if self.preprocessor is None: + raise NotImplementedError() + try: + Xt = self.preprocessor.transform(X) + except ValueError as e: + if "zero-size array to reduction operation maximum which has no " \ + "identity" in e.message: + raise ValueError( + "%s removed all features." % self.__class__.__name__) + else: + raise e + + if Xt.shape[1] == 0: + raise ValueError( + "%s removed all features." % self.__class__.__name__) + return Xt + + @staticmethod + def get_properties(dataset_properties=None): + data_type = UNSIGNED_DATA + + if dataset_properties is not None: + signed = dataset_properties.get('signed') + if signed is not None: + data_type = SIGNED_DATA if signed is True else UNSIGNED_DATA + + return {'shortname': 'SR', + 'name': 'Univariate Feature Selection based on rates', + 'handles_regression': True, + 'handles_classification': False, + 'handles_multiclass': True, + 'handles_multilabel': False, + 'handles_multioutput': False, + 'is_deterministic': True, + 'input': (SPARSE, DENSE, data_type), + 'output': (INPUT,)} + + @staticmethod + def get_hyperparameter_search_space(dataset_properties=None): + alpha = UniformFloatHyperparameter( + name="alpha", lower=0.01, upper=0.5, default_value=0.1) + + if dataset_properties is not None and 'sparse' in dataset_properties \ + and dataset_properties['sparse']: + choices = ['mutual_info_regression'] + else: + choices = ['f_regression', 'mutual_info_regression'] + + score_func = CategoricalHyperparameter( + name="score_func", + choices=choices, + default_value="f_regression" if 'f_regression' in choices else choices[0]) + + mode = CategoricalHyperparameter('mode', ['fpr', 'fdr', 'fwe'], 'fpr') + + cs = ConfigurationSpace() + cs.add_hyperparameter(alpha) + cs.add_hyperparameter(score_func) + cs.add_hyperparameter(mode) + + # In case of mutual info regression, the mode needs to be percentile + # Which is the scikit learn default, else we run into p_values problem + # SMAC Cannot handle OR, so leave this code here for the future. + # Right now, we will have mode in the config space when we + # have mutual_info, yet it is not needed + # if 'mutual_info_regression' in choices: + # cond = NotEqualsCondition(mode, score_func, 'mutual_info_regression') + # cs.add_condition(cond) + # if 'mutual_info_classif' in choices: + # cond = NotEqualsCondition(mode, score_func, 'mutual_info_classif') + # cs.add_condition(cond) + + return cs diff --git a/test/test_metalearning/pyMetaLearn/test_meta_base_data/configurations.csv b/test/test_metalearning/pyMetaLearn/test_meta_base_data/configurations.csv index c5a9d2fca6..8a5921817b 100644 --- a/test/test_metalearning/pyMetaLearn/test_meta_base_data/configurations.csv +++ b/test/test_metalearning/pyMetaLearn/test_meta_base_data/configurations.csv @@ -1,10 +1,10 @@ -idx,balancing:strategy,classifier:__choice__,classifier:adaboost:algorithm,classifier:adaboost:learning_rate,classifier:adaboost:max_depth,classifier:adaboost:n_estimators,classifier:bernoulli_nb:alpha,classifier:bernoulli_nb:fit_prior,classifier:decision_tree:criterion,classifier:decision_tree:max_depth_factor,classifier:decision_tree:max_features,classifier:decision_tree:max_leaf_nodes,classifier:decision_tree:min_impurity_decrease,classifier:decision_tree:min_samples_leaf,classifier:decision_tree:min_samples_split,classifier:decision_tree:min_weight_fraction_leaf,classifier:extra_trees:bootstrap,classifier:extra_trees:criterion,classifier:extra_trees:max_depth,classifier:extra_trees:max_features,classifier:extra_trees:max_leaf_nodes,classifier:extra_trees:min_impurity_decrease,classifier:extra_trees:min_samples_leaf,classifier:extra_trees:min_samples_split,classifier:extra_trees:min_weight_fraction_leaf,classifier:gradient_boosting:early_stop,classifier:gradient_boosting:l2_regularization,classifier:gradient_boosting:learning_rate,classifier:gradient_boosting:loss,classifier:gradient_boosting:max_bins,classifier:gradient_boosting:max_depth,classifier:gradient_boosting:max_leaf_nodes,classifier:gradient_boosting:min_samples_leaf,classifier:gradient_boosting:n_iter_no_change,classifier:gradient_boosting:scoring,classifier:gradient_boosting:tol,classifier:gradient_boosting:validation_fraction,classifier:k_nearest_neighbors:n_neighbors,classifier:k_nearest_neighbors:p,classifier:k_nearest_neighbors:weights,classifier:lda:n_components,classifier:lda:shrinkage,classifier:lda:shrinkage_factor,classifier:lda:tol,classifier:liblinear_svc:C,classifier:liblinear_svc:dual,classifier:liblinear_svc:fit_intercept,classifier:liblinear_svc:intercept_scaling,classifier:liblinear_svc:loss,classifier:liblinear_svc:multi_class,classifier:liblinear_svc:penalty,classifier:liblinear_svc:tol,classifier:libsvm_svc:C,classifier:libsvm_svc:coef0,classifier:libsvm_svc:degree,classifier:libsvm_svc:gamma,classifier:libsvm_svc:kernel,classifier:libsvm_svc:max_iter,classifier:libsvm_svc:shrinking,classifier:libsvm_svc:tol,classifier:multinomial_nb:alpha,classifier:multinomial_nb:fit_prior,classifier:passive_aggressive:C,classifier:passive_aggressive:average,classifier:passive_aggressive:fit_intercept,classifier:passive_aggressive:loss,classifier:passive_aggressive:tol,classifier:qda:reg_param,classifier:random_forest:bootstrap,classifier:random_forest:criterion,classifier:random_forest:max_depth,classifier:random_forest:max_features,classifier:random_forest:max_leaf_nodes,classifier:random_forest:min_impurity_decrease,classifier:random_forest:min_samples_leaf,classifier:random_forest:min_samples_split,classifier:random_forest:min_weight_fraction_leaf,classifier:sgd:alpha,classifier:sgd:average,classifier:sgd:epsilon,classifier:sgd:eta0,classifier:sgd:fit_intercept,classifier:sgd:l1_ratio,classifier:sgd:learning_rate,classifier:sgd:loss,classifier:sgd:penalty,classifier:sgd:power_t,classifier:sgd:tol,data_preprocessing:categorical_transformer:categorical_encoding:__choice__,data_preprocessing:categorical_transformer:category_coalescence:__choice__,data_preprocessing:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction,data_preprocessing:numerical_transformer:imputation:strategy,data_preprocessing:numerical_transformer:rescaling:__choice__,data_preprocessing:numerical_transformer:rescaling:quantile_transformer:n_quantiles,data_preprocessing:numerical_transformer:rescaling:quantile_transformer:output_distribution,data_preprocessing:numerical_transformer:rescaling:robust_scaler:q_max,data_preprocessing:numerical_transformer:rescaling:robust_scaler:q_min,feature_preprocessor:__choice__,feature_preprocessor:extra_trees_preproc_for_classification:bootstrap,feature_preprocessor:extra_trees_preproc_for_classification:criterion,feature_preprocessor:extra_trees_preproc_for_classification:max_depth,feature_preprocessor:extra_trees_preproc_for_classification:max_features,feature_preprocessor:extra_trees_preproc_for_classification:max_leaf_nodes,feature_preprocessor:extra_trees_preproc_for_classification:min_impurity_decrease,feature_preprocessor:extra_trees_preproc_for_classification:min_samples_leaf,feature_preprocessor:extra_trees_preproc_for_classification:min_samples_split,feature_preprocessor:extra_trees_preproc_for_classification:min_weight_fraction_leaf,feature_preprocessor:extra_trees_preproc_for_classification:n_estimators,feature_preprocessor:fast_ica:algorithm,feature_preprocessor:fast_ica:fun,feature_preprocessor:fast_ica:n_components,feature_preprocessor:fast_ica:whiten,feature_preprocessor:feature_agglomeration:affinity,feature_preprocessor:feature_agglomeration:linkage,feature_preprocessor:feature_agglomeration:n_clusters,feature_preprocessor:feature_agglomeration:pooling_func,feature_preprocessor:kernel_pca:coef0,feature_preprocessor:kernel_pca:degree,feature_preprocessor:kernel_pca:gamma,feature_preprocessor:kernel_pca:kernel,feature_preprocessor:kernel_pca:n_components,feature_preprocessor:kitchen_sinks:gamma,feature_preprocessor:kitchen_sinks:n_components,feature_preprocessor:liblinear_svc_preprocessor:C,feature_preprocessor:liblinear_svc_preprocessor:dual,feature_preprocessor:liblinear_svc_preprocessor:fit_intercept,feature_preprocessor:liblinear_svc_preprocessor:intercept_scaling,feature_preprocessor:liblinear_svc_preprocessor:loss,feature_preprocessor:liblinear_svc_preprocessor:multi_class,feature_preprocessor:liblinear_svc_preprocessor:penalty,feature_preprocessor:liblinear_svc_preprocessor:tol,feature_preprocessor:nystroem_sampler:coef0,feature_preprocessor:nystroem_sampler:degree,feature_preprocessor:nystroem_sampler:gamma,feature_preprocessor:nystroem_sampler:kernel,feature_preprocessor:nystroem_sampler:n_components,feature_preprocessor:pca:keep_variance,feature_preprocessor:pca:whiten,feature_preprocessor:polynomial:degree,feature_preprocessor:polynomial:include_bias,feature_preprocessor:polynomial:interaction_only,feature_preprocessor:random_trees_embedding:bootstrap,feature_preprocessor:random_trees_embedding:max_depth,feature_preprocessor:random_trees_embedding:max_leaf_nodes,feature_preprocessor:random_trees_embedding:min_samples_leaf,feature_preprocessor:random_trees_embedding:min_samples_split,feature_preprocessor:random_trees_embedding:min_weight_fraction_leaf,feature_preprocessor:random_trees_embedding:n_estimators,feature_preprocessor:select_percentile_classification:percentile,feature_preprocessor:select_percentile_classification:score_func,feature_preprocessor:select_rates:alpha,feature_preprocessor:select_rates:mode,feature_preprocessor:select_rates:score_func +idx,balancing:strategy,classifier:__choice__,classifier:adaboost:algorithm,classifier:adaboost:learning_rate,classifier:adaboost:max_depth,classifier:adaboost:n_estimators,classifier:bernoulli_nb:alpha,classifier:bernoulli_nb:fit_prior,classifier:decision_tree:criterion,classifier:decision_tree:max_depth_factor,classifier:decision_tree:max_features,classifier:decision_tree:max_leaf_nodes,classifier:decision_tree:min_impurity_decrease,classifier:decision_tree:min_samples_leaf,classifier:decision_tree:min_samples_split,classifier:decision_tree:min_weight_fraction_leaf,classifier:extra_trees:bootstrap,classifier:extra_trees:criterion,classifier:extra_trees:max_depth,classifier:extra_trees:max_features,classifier:extra_trees:max_leaf_nodes,classifier:extra_trees:min_impurity_decrease,classifier:extra_trees:min_samples_leaf,classifier:extra_trees:min_samples_split,classifier:extra_trees:min_weight_fraction_leaf,classifier:gradient_boosting:early_stop,classifier:gradient_boosting:l2_regularization,classifier:gradient_boosting:learning_rate,classifier:gradient_boosting:loss,classifier:gradient_boosting:max_bins,classifier:gradient_boosting:max_depth,classifier:gradient_boosting:max_leaf_nodes,classifier:gradient_boosting:min_samples_leaf,classifier:gradient_boosting:n_iter_no_change,classifier:gradient_boosting:scoring,classifier:gradient_boosting:tol,classifier:gradient_boosting:validation_fraction,classifier:k_nearest_neighbors:n_neighbors,classifier:k_nearest_neighbors:p,classifier:k_nearest_neighbors:weights,classifier:lda:n_components,classifier:lda:shrinkage,classifier:lda:shrinkage_factor,classifier:lda:tol,classifier:liblinear_svc:C,classifier:liblinear_svc:dual,classifier:liblinear_svc:fit_intercept,classifier:liblinear_svc:intercept_scaling,classifier:liblinear_svc:loss,classifier:liblinear_svc:multi_class,classifier:liblinear_svc:penalty,classifier:liblinear_svc:tol,classifier:libsvm_svc:C,classifier:libsvm_svc:coef0,classifier:libsvm_svc:degree,classifier:libsvm_svc:gamma,classifier:libsvm_svc:kernel,classifier:libsvm_svc:max_iter,classifier:libsvm_svc:shrinking,classifier:libsvm_svc:tol,classifier:multinomial_nb:alpha,classifier:multinomial_nb:fit_prior,classifier:passive_aggressive:C,classifier:passive_aggressive:average,classifier:passive_aggressive:fit_intercept,classifier:passive_aggressive:loss,classifier:passive_aggressive:tol,classifier:qda:reg_param,classifier:random_forest:bootstrap,classifier:random_forest:criterion,classifier:random_forest:max_depth,classifier:random_forest:max_features,classifier:random_forest:max_leaf_nodes,classifier:random_forest:min_impurity_decrease,classifier:random_forest:min_samples_leaf,classifier:random_forest:min_samples_split,classifier:random_forest:min_weight_fraction_leaf,classifier:sgd:alpha,classifier:sgd:average,classifier:sgd:epsilon,classifier:sgd:eta0,classifier:sgd:fit_intercept,classifier:sgd:l1_ratio,classifier:sgd:learning_rate,classifier:sgd:loss,classifier:sgd:penalty,classifier:sgd:power_t,classifier:sgd:tol,data_preprocessing:categorical_transformer:categorical_encoding:__choice__,data_preprocessing:categorical_transformer:category_coalescence:__choice__,data_preprocessing:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction,data_preprocessing:numerical_transformer:imputation:strategy,data_preprocessing:numerical_transformer:rescaling:__choice__,data_preprocessing:numerical_transformer:rescaling:quantile_transformer:n_quantiles,data_preprocessing:numerical_transformer:rescaling:quantile_transformer:output_distribution,data_preprocessing:numerical_transformer:rescaling:robust_scaler:q_max,data_preprocessing:numerical_transformer:rescaling:robust_scaler:q_min,feature_preprocessor:__choice__,feature_preprocessor:extra_trees_preproc_for_classification:bootstrap,feature_preprocessor:extra_trees_preproc_for_classification:criterion,feature_preprocessor:extra_trees_preproc_for_classification:max_depth,feature_preprocessor:extra_trees_preproc_for_classification:max_features,feature_preprocessor:extra_trees_preproc_for_classification:max_leaf_nodes,feature_preprocessor:extra_trees_preproc_for_classification:min_impurity_decrease,feature_preprocessor:extra_trees_preproc_for_classification:min_samples_leaf,feature_preprocessor:extra_trees_preproc_for_classification:min_samples_split,feature_preprocessor:extra_trees_preproc_for_classification:min_weight_fraction_leaf,feature_preprocessor:extra_trees_preproc_for_classification:n_estimators,feature_preprocessor:fast_ica:algorithm,feature_preprocessor:fast_ica:fun,feature_preprocessor:fast_ica:n_components,feature_preprocessor:fast_ica:whiten,feature_preprocessor:feature_agglomeration:affinity,feature_preprocessor:feature_agglomeration:linkage,feature_preprocessor:feature_agglomeration:n_clusters,feature_preprocessor:feature_agglomeration:pooling_func,feature_preprocessor:kernel_pca:coef0,feature_preprocessor:kernel_pca:degree,feature_preprocessor:kernel_pca:gamma,feature_preprocessor:kernel_pca:kernel,feature_preprocessor:kernel_pca:n_components,feature_preprocessor:kitchen_sinks:gamma,feature_preprocessor:kitchen_sinks:n_components,feature_preprocessor:liblinear_svc_preprocessor:C,feature_preprocessor:liblinear_svc_preprocessor:dual,feature_preprocessor:liblinear_svc_preprocessor:fit_intercept,feature_preprocessor:liblinear_svc_preprocessor:intercept_scaling,feature_preprocessor:liblinear_svc_preprocessor:loss,feature_preprocessor:liblinear_svc_preprocessor:multi_class,feature_preprocessor:liblinear_svc_preprocessor:penalty,feature_preprocessor:liblinear_svc_preprocessor:tol,feature_preprocessor:nystroem_sampler:coef0,feature_preprocessor:nystroem_sampler:degree,feature_preprocessor:nystroem_sampler:gamma,feature_preprocessor:nystroem_sampler:kernel,feature_preprocessor:nystroem_sampler:n_components,feature_preprocessor:pca:keep_variance,feature_preprocessor:pca:whiten,feature_preprocessor:polynomial:degree,feature_preprocessor:polynomial:include_bias,feature_preprocessor:polynomial:interaction_only,feature_preprocessor:random_trees_embedding:bootstrap,feature_preprocessor:random_trees_embedding:max_depth,feature_preprocessor:random_trees_embedding:max_leaf_nodes,feature_preprocessor:random_trees_embedding:min_samples_leaf,feature_preprocessor:random_trees_embedding:min_samples_split,feature_preprocessor:random_trees_embedding:min_weight_fraction_leaf,feature_preprocessor:random_trees_embedding:n_estimators,feature_preprocessor:select_percentile_classification:percentile,feature_preprocessor:select_percentile_classification:score_func,feature_preprocessor:select_rates_classification:alpha,feature_preprocessor:select_rates_classification:mode,feature_preprocessor:select_rates_classification:score_func 1,none,gradient_boosting,,,,,,,,,,,,,,,,,,,,,,,,off,0.006832691101653281,0.0990420448281782,auto,255,None,58,9,,loss,1e-07,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,minority_coalescer,0.047607909209835673,most_frequent,normalize,,,,,feature_agglomeration,,,,,,,,,,,,,,,cosine,average,72,median,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -2,weighting,gradient_boosting,,,,,,,,,,,,,,,,,,,,,,,,off,2.215660250704945e-08,0.0568967527929491,auto,255,None,74,58,,loss,1e-07,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,no_coalescense,,most_frequent,none,,,,,select_rates,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.3999502319254789,fwe,f_classif +2,weighting,gradient_boosting,,,,,,,,,,,,,,,,,,,,,,,,off,2.215660250704945e-08,0.0568967527929491,auto,255,None,74,58,,loss,1e-07,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,no_coalescense,,most_frequent,none,,,,,select_rates_classification,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.3999502319254789,fwe,f_classif 3,none,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True,entropy,None,0.7811236762098946,None,0.0,15,9,0.0,,,,,,,,,,,,no_encoding,no_coalescense,,most_frequent,none,,,,,extra_trees_preproc_for_classification,False,entropy,None,0.2269858618750471,None,0.0,13,12,0.0,100,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -4,weighting,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,gini,None,0.5804208006044023,None,0.0,5,2,0.0,,,,,,,,,,,,no_encoding,no_coalescense,,median,standardize,,,,,select_rates,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.061500733991527654,fdr,f_classif +4,weighting,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,gini,None,0.5804208006044023,None,0.0,5,2,0.0,,,,,,,,,,,,no_encoding,no_coalescense,,median,standardize,,,,,select_rates_classification,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.061500733991527654,fdr,f_classif 5,none,qda,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.563056219822946,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,0.32793677336996485,most_frequent,none,,,,,no_preprocessing,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -6,none,libsvm_svc,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,683.603209772402,-0.7761786661778607,4,1.0146245161392977,poly,-1,True,0.0004729761062000146,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,minority_coalescer,0.07556779791699596,most_frequent,standardize,,,,,select_rates,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.46651479293540027,fdr,f_classif +6,none,libsvm_svc,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,683.603209772402,-0.7761786661778607,4,1.0146245161392977,poly,-1,True,0.0004729761062000146,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,minority_coalescer,0.07556779791699596,most_frequent,standardize,,,,,select_rates_classification,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.46651479293540027,fdr,f_classif 7,weighting,libsvm_svc,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.993647974710288,-0.09714179076410145,2,0.10000000000000006,poly,-1,True,0.0011475566557439987,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,no_coalescense,,mean,robust_scaler,,,0.8124421960026027,0.18251138129426106,polynomial,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,False,True,,,,,,,,,,,, 8,none,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True,gini,None,0.31482574716831474,None,0.0,15,2,0.0,,,,,,,,,,,,no_encoding,no_coalescense,,most_frequent,standardize,,,,,liblinear_svc_preprocessor,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,False,True,1,squared_hinge,ovr,l1,5.5234897124903465e-05,,,,,,,,,,,,,,,,,,,,,, 9,weighting,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,gini,None,0.8932965286370729,None,0.0,1,2,0.0,,,,,,,,,,,,one_hot_encoding,no_coalescense,,median,minmax,,,,,polynomial,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,False,False,,,,,,,,,,,, @@ -31,22 +31,22 @@ idx,balancing:strategy,classifier:__choice__,classifier:adaboost:algorithm,class 30,none,gradient_boosting,,,,,,,,,,,,,,,,,,,,,,,,off,0.6993161849181185,0.44235005157802176,auto,255,None,270,15,,loss,1e-07,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,minority_coalescer,0.0077758033214372,mean,none,,,,,no_preprocessing,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 31,weighting,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,gini,None,0.34085742012558995,None,0.0,2,2,0.0,,,,,,,,,,,,no_encoding,minority_coalescer,0.010251558508210521,most_frequent,normalize,,,,,select_percentile_classification,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,94.77374433257484,chi2,,, 32,weighting,libsvm_svc,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23.617163356857034,0.4249652342218557,2,0.03426568422270486,poly,-1,True,0.0003809897288698571,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,minority_coalescer,0.004949301030421484,most_frequent,quantile_transformer,847,normal,,,liblinear_svc_preprocessor,,,,,,,,,,,,,,,,,,,,,,,,,,3.3404000226016595,False,True,1,squared_hinge,ovr,l1,0.00013812954117187317,,,,,,,,,,,,,,,,,,,,,, -33,weighting,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,gini,None,0.4751607001217574,None,0.0,2,18,0.0,,,,,,,,,,,,no_encoding,minority_coalescer,0.0015489667569464098,most_frequent,standardize,,,,,select_rates,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.2465775971025667,fpr,chi2 -34,weighting,gradient_boosting,,,,,,,,,,,,,,,,,,,,,,,,valid,0.787172957129578,0.23076913534674612,auto,255,None,8,4,10,loss,1e-07,0.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,0.002842817334543296,mean,standardize,,,,,select_rates,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.2779207466036798,fwe,f_classif +33,weighting,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,gini,None,0.4751607001217574,None,0.0,2,18,0.0,,,,,,,,,,,,no_encoding,minority_coalescer,0.0015489667569464098,most_frequent,standardize,,,,,select_rates_classification,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.2465775971025667,fpr,chi2 +34,weighting,gradient_boosting,,,,,,,,,,,,,,,,,,,,,,,,valid,0.787172957129578,0.23076913534674612,auto,255,None,8,4,10,loss,1e-07,0.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,0.002842817334543296,mean,standardize,,,,,select_rates_classification,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.2779207466036798,fwe,f_classif 35,none,libsvm_svc,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,158.8949863228855,,,0.03920054687126197,rbf,-1,True,0.05469985785058926,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,0.015996674733825135,most_frequent,quantile_transformer,1033,uniform,,,fast_ica,,,,,,,,,,,parallel,logcosh,,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 36,weighting,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,entropy,None,0.1161756717784211,None,0.0,2,2,0.0,,,,,,,,,,,,no_encoding,no_coalescense,,mean,normalize,,,,,polynomial,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,False,False,,,,,,,,,,,, -37,weighting,libsvm_svc,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,114.37037237306517,,,0.013196149743002957,rbf,-1,False,2.2119982336561568e-05,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,0.030600209348810598,median,robust_scaler,,,0.8903774541072713,0.14849508114407797,select_rates,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.10727720089253716,fwe,f_classif +37,weighting,libsvm_svc,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,114.37037237306517,,,0.013196149743002957,rbf,-1,False,2.2119982336561568e-05,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,0.030600209348810598,median,robust_scaler,,,0.8903774541072713,0.14849508114407797,select_rates_classification,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.10727720089253716,fwe,f_classif 38,none,libsvm_svc,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,25369.899739311986,-0.20538081740449166,,0.007550793530761754,sigmoid,-1,True,0.00014198788135109906,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,minority_coalescer,0.009250691729522439,most_frequent,quantile_transformer,1442,uniform,,,fast_ica,,,,,,,,,,,parallel,cube,,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 39,none,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True,gini,None,0.742074481485891,None,0.0,1,2,0.0,,,,,,,,,,,,one_hot_encoding,minority_coalescer,0.010388289410086769,mean,none,,,,,no_preprocessing,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 40,none,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,gini,None,0.5,None,0.0,1,2,0.0,,,,,,,,,,,,one_hot_encoding,minority_coalescer,0.0038325481818368653,most_frequent,quantile_transformer,1000,uniform,,,polynomial,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,False,False,,,,,,,,,,,, 41,none,passive_aggressive,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.46057831591617715,False,True,hinge,0.04557857428827514,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,minority_coalescer,0.00027457445401600137,median,standardize,,,,,extra_trees_preproc_for_classification,True,gini,None,0.48190346970486964,None,0.0,17,18,0.0,100,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 42,weighting,lda,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,34,auto,,0.00012339000686260981,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,minority_coalescer,0.010000000000000004,mean,standardize,,,,,kitchen_sinks,,,,,,,,,,,,,,,,,,,,,,,,0.027161884929113287,3011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -43,none,libsvm_svc,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1007.8868860667042,0.7073639177519475,2,0.0009693320195457126,poly,-1,True,0.00048384544670559135,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,minority_coalescer,0.017078985265493323,median,quantile_transformer,971,uniform,,,select_rates,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.27854669854596986,fpr,f_classif +43,none,libsvm_svc,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1007.8868860667042,0.7073639177519475,2,0.0009693320195457126,poly,-1,True,0.00048384544670559135,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,minority_coalescer,0.017078985265493323,median,quantile_transformer,971,uniform,,,select_rates_classification,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.27854669854596986,fpr,f_classif 44,weighting,gradient_boosting,,,,,,,,,,,,,,,,,,,,,,,,off,0.0009039383509168851,0.013859624893482336,auto,255,None,314,166,,loss,1e-07,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,minority_coalescer,0.07166826832005445,median,robust_scaler,,,0.8113117119932765,0.22229745700501014,no_preprocessing,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 45,none,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True,gini,None,0.5916641238089724,None,0.0,1,7,0.0,,,,,,,,,,,,one_hot_encoding,minority_coalescer,0.014941875096420176,most_frequent,standardize,,,,,extra_trees_preproc_for_classification,True,gini,None,0.6621674571394228,None,0.0,3,11,0.0,100,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 46,none,passive_aggressive,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.4177635558897493,True,True,hinge,0.00036622547004230247,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,minority_coalescer,0.3298639925115399,median,normalize,,,,,kitchen_sinks,,,,,,,,,,,,,,,,,,,,,,,,0.02443001336430177,7802,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 47,weighting,sgd,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6.576840761438448e-07,True,,0.0003087686113414944,True,1.0895900532824292e-07,constant,hinge,elasticnet,,0.007781223173502778,no_encoding,minority_coalescer,0.002482961497851837,mean,none,,,,,no_preprocessing,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -48,weighting,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,gini,None,0.6149200141024044,None,0.0,3,6,0.0,,,,,,,,,,,,no_encoding,no_coalescense,,mean,normalize,,,,,select_rates,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.27403124544524843,fdr,chi2 +48,weighting,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,gini,None,0.6149200141024044,None,0.0,3,6,0.0,,,,,,,,,,,,no_encoding,no_coalescense,,mean,normalize,,,,,select_rates_classification,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.27403124544524843,fdr,chi2 49,weighting,adaboost,SAMME,1.1345415570152533,6,472,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,no_coalescense,,mean,normalize,,,,,select_percentile_classification,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,89.55941314463736,f_classif,,, 50,none,adaboost,SAMME,1.117891964153124,7,350,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,minority_coalescer,0.099090775365223,mean,standardize,,,,,fast_ica,,,,,,,,,,,parallel,exp,1122,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 51,none,gradient_boosting,,,,,,,,,,,,,,,,,,,,,,,,off,1.260108334347015e-07,0.020450900578038868,auto,255,None,1907,82,,loss,1e-07,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,no_coalescense,,median,quantile_transformer,1015,normal,,,pca,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.9704315246405552,True,,,,,,,,,,,,,,, @@ -60,7 +60,7 @@ idx,balancing:strategy,classifier:__choice__,classifier:adaboost:algorithm,class 59,none,passive_aggressive,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.38845176895497546,True,True,hinge,0.07195442121939964,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,minority_coalescer,0.009803171174126721,most_frequent,minmax,,,,,polynomial,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,True,False,,,,,,,,,,,, 60,none,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,entropy,None,0.19548169161642792,None,0.0,10,18,0.0,,,,,,,,,,,,no_encoding,minority_coalescer,0.0031030790458014663,most_frequent,normalize,,,,,select_percentile_classification,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,96.55453782974163,f_classif,,, 61,weighting,libsvm_svc,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4077.0183557137793,,,0.18582946048483806,rbf,-1,True,0.007982841167341137,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,minority_coalescer,0.0024292204383546253,mean,robust_scaler,,,0.7925685994397953,0.28082571006541873,feature_agglomeration,,,,,,,,,,,,,,,cosine,average,15,mean,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -62,none,gradient_boosting,,,,,,,,,,,,,,,,,,,,,,,,off,0.02145872972690199,0.03842927840160621,auto,255,None,171,8,,loss,1e-07,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,0.004228524718610471,most_frequent,standardize,,,,,select_rates,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.13472853186492292,fdr,chi2 +62,none,gradient_boosting,,,,,,,,,,,,,,,,,,,,,,,,off,0.02145872972690199,0.03842927840160621,auto,255,None,171,8,,loss,1e-07,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,0.004228524718610471,most_frequent,standardize,,,,,select_rates_classification,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.13472853186492292,fdr,chi2 63,weighting,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,gini,None,0.35459002631952014,None,0.0,5,15,0.0,,,,,,,,,,,,one_hot_encoding,minority_coalescer,0.010270173676218672,median,quantile_transformer,1807,uniform,,,select_percentile_classification,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,56.525707700661215,chi2,,, 64,none,libsvm_svc,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10091.529330032845,,,0.0011283303013784186,rbf,-1,True,0.006930076959856067,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,no_coalescense,,most_frequent,minmax,,,,,polynomial,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,True,True,,,,,,,,,,,, 65,weighting,sgd,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.7553798077390236e-05,False,,,True,0.5295119133805599,optimal,log,elasticnet,,0.0002846848503288152,no_encoding,minority_coalescer,0.05377825070455988,mean,quantile_transformer,1591,normal,,,fast_ica,,,,,,,,,,,deflation,cube,1400,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, @@ -90,7 +90,7 @@ idx,balancing:strategy,classifier:__choice__,classifier:adaboost:algorithm,class 89,weighting,qda,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.07578664472529394,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,minority_coalescer,0.012596384519267407,median,robust_scaler,,,0.7176883035814098,0.2870577047962274,liblinear_svc_preprocessor,,,,,,,,,,,,,,,,,,,,,,,,,,0.6428686651502072,False,True,1,squared_hinge,ovr,l1,2.5546943595340656e-05,,,,,,,,,,,,,,,,,,,,,, 90,weighting,adaboost,SAMME,0.07959216314142419,1,124,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,no_coalescense,,median,robust_scaler,,,0.7401836136931198,0.2679472228039613,select_percentile_classification,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,18.787733861356816,mutual_info,,, 91,weighting,lda,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,89,auto,,0.0819425045156221,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,0.37737820096945385,median,minmax,,,,,nystroem_sampler,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.5073162154954842,2,0.004207352122999392,poly,1212,,,,,,,,,,,,,,,,, -92,weighting,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,entropy,None,0.7229728445103076,None,0.0,5,13,0.0,,,,,,,,,,,,no_encoding,no_coalescense,,mean,robust_scaler,,,0.7823020129596692,0.1205596141179452,select_rates,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.12983623180397538,fwe,f_classif +92,weighting,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,entropy,None,0.7229728445103076,None,0.0,5,13,0.0,,,,,,,,,,,,no_encoding,no_coalescense,,mean,robust_scaler,,,0.7823020129596692,0.1205596141179452,select_rates_classification,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.12983623180397538,fwe,f_classif 93,weighting,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,entropy,None,0.7982423863663426,None,0.0,11,7,0.0,,,,,,,,,,,,no_encoding,minority_coalescer,0.003645478141655197,median,robust_scaler,,,0.7651599230489026,0.15388614105871848,feature_agglomeration,,,,,,,,,,,,,,,cosine,average,34,mean,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 94,none,adaboost,SAMME,0.04534487012126666,9,237,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,0.009377862051712454,median,minmax,,,,,liblinear_svc_preprocessor,,,,,,,,,,,,,,,,,,,,,,,,,,4.281022361344507,False,True,1,squared_hinge,ovr,l1,0.02040524760798526,,,,,,,,,,,,,,,,,,,,,, 95,none,qda,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.563056219822946,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,0.32793677336996485,most_frequent,none,,,,,no_preprocessing,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, @@ -98,7 +98,7 @@ idx,balancing:strategy,classifier:__choice__,classifier:adaboost:algorithm,class 97,none,qda,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.563056219822946,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,0.32793677336996485,most_frequent,none,,,,,no_preprocessing,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 98,weighting,adaboost,SAMME,0.24826166093503962,4,203,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,0.011447514256202326,median,quantile_transformer,949,normal,,,pca,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.7702718499065888,True,,,,,,,,,,,,,,, 99,none,gradient_boosting,,,,,,,,,,,,,,,,,,,,,,,,off,5.295700573535198e-10,0.042756254512807394,auto,255,None,84,14,,loss,1e-07,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,0.35207536432313746,median,robust_scaler,,,0.7792676238311911,0.28901203457977576,feature_agglomeration,,,,,,,,,,,,,,,cosine,average,248,max,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -100,weighting,gradient_boosting,,,,,,,,,,,,,,,,,,,,,,,,off,0.0016445078304079647,0.1997863062244349,auto,255,None,564,15,,loss,1e-07,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,no_coalescense,,most_frequent,robust_scaler,,,0.75,0.23248947228355937,select_rates,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.32975511665416357,fdr,f_classif +100,weighting,gradient_boosting,,,,,,,,,,,,,,,,,,,,,,,,off,0.0016445078304079647,0.1997863062244349,auto,255,None,564,15,,loss,1e-07,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,no_coalescense,,most_frequent,robust_scaler,,,0.75,0.23248947228355937,select_rates_classification,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.32975511665416357,fdr,f_classif 101,weighting,passive_aggressive,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0019618741335452826,True,True,hinge,0.0004803120822404903,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,no_coalescense,,most_frequent,standardize,,,,,select_percentile_classification,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,57.38056668131513,mutual_info,,, 102,weighting,adaboost,SAMME.R,0.0190998863782481,7,99,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,no_coalescense,,most_frequent,quantile_transformer,1177,uniform,,,random_trees_embedding,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,7,None,1,3,1.0,97,,,,, 103,weighting,extra_trees,,,,,,,,,,,,,,,False,gini,None,0.8850157429082246,None,0.0,12,9,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,0.27673478870889345,median,none,,,,,fast_ica,,,,,,,,,,,deflation,cube,,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, @@ -111,14 +111,14 @@ idx,balancing:strategy,classifier:__choice__,classifier:adaboost:algorithm,class 110,weighting,adaboost,SAMME.R,0.22665749778830807,7,78,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,no_coalescense,,most_frequent,quantile_transformer,1000,uniform,,,feature_agglomeration,,,,,,,,,,,,,,,cosine,complete,373,median,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 111,none,libsvm_svc,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1689.0860195745497,0.7404917548199534,2,0.011884114654356123,poly,-1,True,4.262566522678876e-05,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,minority_coalescer,0.004395163582476699,median,minmax,,,,,random_trees_embedding,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,3,None,13,13,1.0,74,,,,, 112,none,gradient_boosting,,,,,,,,,,,,,,,,,,,,,,,,off,2.859589328406253e-07,0.25392293346701533,auto,255,None,4,80,,loss,1e-07,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,no_coalescense,,mean,standardize,,,,,select_percentile_classification,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,65.7135135608214,mutual_info,,, -113,none,gradient_boosting,,,,,,,,,,,,,,,,,,,,,,,,valid,9.097196057095871e-06,0.32800101253288033,auto,255,None,53,28,20,loss,1e-07,0.101919468281566,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,0.008121631984215255,median,robust_scaler,,,0.7654960296398138,0.25,select_rates,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.07288328235161678,fwe,chi2 +113,none,gradient_boosting,,,,,,,,,,,,,,,,,,,,,,,,valid,9.097196057095871e-06,0.32800101253288033,auto,255,None,53,28,20,loss,1e-07,0.101919468281566,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,0.008121631984215255,median,robust_scaler,,,0.7654960296398138,0.25,select_rates_classification,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.07288328235161678,fwe,chi2 114,weighting,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,entropy,None,0.5463209559127865,None,0.0,12,5,0.0,,,,,,,,,,,,no_encoding,minority_coalescer,0.025636105021492692,mean,standardize,,,,,liblinear_svc_preprocessor,,,,,,,,,,,,,,,,,,,,,,,,,,1.4472785394247571,False,True,1,squared_hinge,ovr,l1,0.00018809455411335498,,,,,,,,,,,,,,,,,,,,,, 115,weighting,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True,entropy,None,0.07970183198340376,None,0.0,9,9,0.0,,,,,,,,,,,,no_encoding,minority_coalescer,0.008729901092151533,most_frequent,quantile_transformer,1028,normal,,,polynomial,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,True,True,,,,,,,,,,,, -116,weighting,gradient_boosting,,,,,,,,,,,,,,,,,,,,,,,,off,2.050858257794119e-10,0.0509713008465305,auto,255,None,17,16,,loss,1e-07,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,0.0006772048110168395,median,none,,,,,select_rates,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.42719682445182733,fdr,f_classif +116,weighting,gradient_boosting,,,,,,,,,,,,,,,,,,,,,,,,off,2.050858257794119e-10,0.0509713008465305,auto,255,None,17,16,,loss,1e-07,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,0.0006772048110168395,median,none,,,,,select_rates_classification,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.42719682445182733,fdr,f_classif 117,weighting,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,gini,None,0.4886932005592788,None,0.0,1,17,0.0,,,,,,,,,,,,no_encoding,no_coalescense,,median,minmax,,,,,liblinear_svc_preprocessor,,,,,,,,,,,,,,,,,,,,,,,,,,2.2673178962517726,False,True,1,squared_hinge,ovr,l1,0.07576775715726437,,,,,,,,,,,,,,,,,,,,,, 118,none,libsvm_svc,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1954.4001293172055,0.5941977727413141,3,0.01222672837922025,poly,-1,False,0.000868704184075337,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,0.023611476558497053,most_frequent,quantile_transformer,1312,normal,,,extra_trees_preproc_for_classification,True,entropy,None,0.26038719206370126,None,0.0,6,14,0.0,100,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 119,none,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True,gini,None,0.48772464140872207,None,0.0,1,16,0.0,,,,,,,,,,,,no_encoding,minority_coalescer,0.010000000000000004,most_frequent,normalize,,,,,polynomial,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,False,False,,,,,,,,,,,, -120,none,libsvm_svc,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,571.8976418358935,-0.6542106402522795,,5.0850539598583375e-05,sigmoid,-1,False,0.003954814208041632,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,no_coalescense,,median,none,,,,,select_rates,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.1378799965815952,fwe,f_classif +120,none,libsvm_svc,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,571.8976418358935,-0.6542106402522795,,5.0850539598583375e-05,sigmoid,-1,False,0.003954814208041632,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,no_coalescense,,median,none,,,,,select_rates_classification,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.1378799965815952,fwe,f_classif 121,weighting,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True,entropy,None,0.6204291847226782,None,0.0,2,7,0.0,,,,,,,,,,,,one_hot_encoding,minority_coalescer,0.012802264108301202,most_frequent,normalize,,,,,fast_ica,,,,,,,,,,,deflation,exp,,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 122,weighting,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,gini,None,0.7159488281157247,None,0.0,15,3,0.0,,,,,,,,,,,,no_encoding,no_coalescense,,mean,robust_scaler,,,0.7546151696972261,0.25941712940346606,feature_agglomeration,,,,,,,,,,,,,,,cosine,average,33,mean,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 123,none,random_forest,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,entropy,None,0.4285190453868457,None,0.0,1,2,0.0,,,,,,,,,,,,one_hot_encoding,minority_coalescer,0.49851517731857553,most_frequent,quantile_transformer,958,uniform,,,polynomial,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,False,False,,,,,,,,,,,, diff --git a/test/test_pipeline/components/feature_preprocessing/test_select_rates.py b/test/test_pipeline/components/feature_preprocessing/test_select_rates.py deleted file mode 100644 index 204445385c..0000000000 --- a/test/test_pipeline/components/feature_preprocessing/test_select_rates.py +++ /dev/null @@ -1,141 +0,0 @@ -import unittest - -import numpy as np -import scipy.sparse -import sklearn.preprocessing - -from autosklearn.pipeline.components.feature_preprocessing.select_rates import \ - SelectRates -from autosklearn.pipeline.util import _test_preprocessing, get_dataset - - -class SelectRatesComponentTest(unittest.TestCase): - def test_default_configuration(self): - transformation, original = _test_preprocessing(SelectRates) - self.assertEqual(transformation.shape[0], original.shape[0]) - self.assertEqual(transformation.shape[1], 3) - self.assertFalse((transformation == 0).all()) - - transformation, original = _test_preprocessing( - SelectRates, make_sparse=True) - self.assertTrue(scipy.sparse.issparse(transformation)) - self.assertEqual(transformation.shape[0], original.shape[0]) - self.assertEqual(transformation.shape[1], int(original.shape[1] / 2)) - - # Custom preprocessing test to check if clipping to zero works - X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits') - original_X_train = X_train.copy() - ss = sklearn.preprocessing.StandardScaler() - X_train = ss.fit_transform(X_train) - configuration_space = SelectRates.get_hyperparameter_search_space() - default = configuration_space.get_default_configuration() - - preprocessor = SelectRates(random_state=1, - **{hp_name: default[hp_name] - for hp_name in default - if default[hp_name] is not None}) - - transformer = preprocessor.fit(X_train, Y_train) - transformation, original = transformer.transform( - X_train), original_X_train - self.assertEqual(transformation.shape[0], original.shape[0]) - # I don't know why it's 52 here and not 32 which would be half of the - # number of features. Seems to be related to a runtime warning raised - # by sklearn - self.assertEqual(transformation.shape[1], 52) - - def test_preprocessing_dtype(self): - # Dense - # np.float32 - X_train, Y_train, X_test, Y_test = get_dataset("iris") - self.assertEqual(X_train.dtype, np.float32) - - configuration_space = SelectRates.get_hyperparameter_search_space() - default = configuration_space.get_default_configuration() - preprocessor = SelectRates(random_state=1, - **{hp_name: default[hp_name] for hp_name in - default}) - preprocessor.fit(X_train, Y_train) - Xt = preprocessor.transform(X_train) - self.assertEqual(Xt.dtype, np.float32) - - # np.float64 - X_train, Y_train, X_test, Y_test = get_dataset("iris") - X_train = X_train.astype(np.float64) - configuration_space = SelectRates.get_hyperparameter_search_space() - default = configuration_space.get_default_configuration() - preprocessor = SelectRates(random_state=1, - **{hp_name: default[hp_name] for hp_name in - default}) - preprocessor.fit(X_train, Y_train) - Xt = preprocessor.transform(X_train) - self.assertEqual(Xt.dtype, np.float64) - - # Sparse - # np.float32 - X_train, Y_train, X_test, Y_test = get_dataset("iris", make_sparse=True) - self.assertEqual(X_train.dtype, np.float32) - configuration_space = SelectRates.get_hyperparameter_search_space() - default = configuration_space.get_default_configuration() - preprocessor = SelectRates(random_state=1, - **{hp_name: default[hp_name] for hp_name in - default}) - preprocessor.fit(X_train, Y_train) - Xt = preprocessor.transform(X_train) - self.assertEqual(Xt.dtype, np.float32) - - # np.float64 - X_train, Y_train, X_test, Y_test = get_dataset("iris", make_sparse=True) - X_train = X_train.astype(np.float64) - configuration_space = SelectRates.get_hyperparameter_search_space() - default = configuration_space.get_default_configuration() - preprocessor = SelectRates(random_state=1, - **{hp_name: default[hp_name] for hp_name in - default}) - preprocessor.fit(X_train, Y_train) - Xt = preprocessor.transform(X_train) - self.assertEqual(Xt.dtype, np.float64) - - def test_default_configuration_regression(self): - transformation, original = _test_preprocessing( - SelectRates, - dataset='boston', - task='regression', - ) - self.assertEqual(transformation.shape[0], original.shape[0]) - # From 13 to 12 features - self.assertEqual(transformation.shape[1], 12) - self.assertFalse((transformation == 0).all()) - - def test_preprocessing_dtype_regression(self): - # Dense - # np.float32 - X_train, Y_train, X_test, Y_test = get_dataset("boston") - self.assertEqual(X_train.dtype, np.float32) - - dataset_properties = {'target_type': 'regression'} - - configuration_space = SelectRates.get_hyperparameter_search_space( - dataset_properties - ) - default = configuration_space.get_default_configuration() - preprocessor = SelectRates(random_state=1, - **{hp_name: default[hp_name] for hp_name in - default}) - preprocessor.fit(X_train, Y_train) - Xt = preprocessor.transform(X_train) - self.assertEqual(Xt.dtype, np.float32) - - # np.float64 - X_train, Y_train, X_test, Y_test = get_dataset("boston") - X_train = X_train.astype(np.float64) - configuration_space = SelectRates.get_hyperparameter_search_space( - dataset_properties - ) - default = configuration_space.get_default_configuration() - preprocessor = SelectRates(random_state=1, - **{hp_name: default[hp_name] for hp_name in - default}) - preprocessor.fit(X_train, Y_train) - Xt = preprocessor.transform(X_train) - self.assertEqual(Xt.dtype, np.float64) diff --git a/test/test_pipeline/components/feature_preprocessing/test_select_rates_classification.py b/test/test_pipeline/components/feature_preprocessing/test_select_rates_classification.py new file mode 100644 index 0000000000..2497b5174a --- /dev/null +++ b/test/test_pipeline/components/feature_preprocessing/test_select_rates_classification.py @@ -0,0 +1,97 @@ +import unittest + +import numpy as np +import scipy.sparse +import sklearn.preprocessing + +from autosklearn.pipeline.components.feature_preprocessing.select_rates_classification import \ + SelectClassificationRates +from autosklearn.pipeline.util import _test_preprocessing, get_dataset + + +class SelectClassificationRatesComponentTest(unittest.TestCase): + def test_default_configuration(self): + transformation, original = _test_preprocessing(SelectClassificationRates) + self.assertEqual(transformation.shape[0], original.shape[0]) + self.assertEqual(transformation.shape[1], 3) + self.assertFalse((transformation == 0).all()) + + transformation, original = _test_preprocessing( + SelectClassificationRates, make_sparse=True) + self.assertTrue(scipy.sparse.issparse(transformation)) + self.assertEqual(transformation.shape[0], original.shape[0]) + self.assertEqual(transformation.shape[1], int(original.shape[1] / 2)) + + # Custom preprocessing test to check if clipping to zero works + X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits') + original_X_train = X_train.copy() + ss = sklearn.preprocessing.StandardScaler() + X_train = ss.fit_transform(X_train) + configuration_space = SelectClassificationRates.get_hyperparameter_search_space() + default = configuration_space.get_default_configuration() + + preprocessor = SelectClassificationRates(random_state=1, + **{hp_name: default[hp_name] + for hp_name in default + if default[hp_name] is not None}) + + transformer = preprocessor.fit(X_train, Y_train) + transformation, original = transformer.transform( + X_train), original_X_train + self.assertEqual(transformation.shape[0], original.shape[0]) + # I don't know why it's 52 here and not 32 which would be half of the + # number of features. Seems to be related to a runtime warning raised + # by sklearn + self.assertEqual(transformation.shape[1], 52) + + def test_preprocessing_dtype(self): + # Dense + # np.float32 + X_train, Y_train, X_test, Y_test = get_dataset("iris") + self.assertEqual(X_train.dtype, np.float32) + + configuration_space = SelectClassificationRates.get_hyperparameter_search_space() + default = configuration_space.get_default_configuration() + preprocessor = SelectClassificationRates(random_state=1, + **{hp_name: default[hp_name] for hp_name in + default}) + preprocessor.fit(X_train, Y_train) + Xt = preprocessor.transform(X_train) + self.assertEqual(Xt.dtype, np.float32) + + # np.float64 + X_train, Y_train, X_test, Y_test = get_dataset("iris") + X_train = X_train.astype(np.float64) + configuration_space = SelectClassificationRates.get_hyperparameter_search_space() + default = configuration_space.get_default_configuration() + preprocessor = SelectClassificationRates(random_state=1, + **{hp_name: default[hp_name] for hp_name in + default}) + preprocessor.fit(X_train, Y_train) + Xt = preprocessor.transform(X_train) + self.assertEqual(Xt.dtype, np.float64) + + # Sparse + # np.float32 + X_train, Y_train, X_test, Y_test = get_dataset("iris", make_sparse=True) + self.assertEqual(X_train.dtype, np.float32) + configuration_space = SelectClassificationRates.get_hyperparameter_search_space() + default = configuration_space.get_default_configuration() + preprocessor = SelectClassificationRates(random_state=1, + **{hp_name: default[hp_name] for hp_name in + default}) + preprocessor.fit(X_train, Y_train) + Xt = preprocessor.transform(X_train) + self.assertEqual(Xt.dtype, np.float32) + + # np.float64 + X_train, Y_train, X_test, Y_test = get_dataset("iris", make_sparse=True) + X_train = X_train.astype(np.float64) + configuration_space = SelectClassificationRates.get_hyperparameter_search_space() + default = configuration_space.get_default_configuration() + preprocessor = SelectClassificationRates(random_state=1, + **{hp_name: default[hp_name] for hp_name in + default}) + preprocessor.fit(X_train, Y_train) + Xt = preprocessor.transform(X_train) + self.assertEqual(Xt.dtype, np.float64) diff --git a/test/test_pipeline/components/feature_preprocessing/test_select_rates_regression.py b/test/test_pipeline/components/feature_preprocessing/test_select_rates_regression.py new file mode 100644 index 0000000000..a8e7659d5c --- /dev/null +++ b/test/test_pipeline/components/feature_preprocessing/test_select_rates_regression.py @@ -0,0 +1,86 @@ +import unittest + +import numpy as np +import scipy.sparse +import sklearn.preprocessing + +from autosklearn.pipeline.components.feature_preprocessing.select_rates_regression import \ + SelectRegressionRates +from autosklearn.pipeline.util import _test_preprocessing, get_dataset + + +class SelectRegressionRatesComponentTest(unittest.TestCase): + def test_default_configuration(self): + transformation, original = _test_preprocessing(SelectRegressionRates) + self.assertEqual(transformation.shape[0], original.shape[0]) + self.assertEqual(transformation.shape[1], 4) + self.assertFalse((transformation == 0).all()) + + transformation, original = _test_preprocessing( + SelectRegressionRates, make_sparse=True) + self.assertTrue(scipy.sparse.issparse(transformation)) + self.assertEqual(transformation.shape[0], original.shape[0]) + self.assertEqual(transformation.shape[1], int(original.shape[1] / 2)) + + # Custom preprocessing test to check if clipping to zero works + X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits') + original_X_train = X_train.copy() + ss = sklearn.preprocessing.StandardScaler() + X_train = ss.fit_transform(X_train) + configuration_space = SelectRegressionRates.get_hyperparameter_search_space() + default = configuration_space.get_default_configuration() + + preprocessor = SelectRegressionRates(random_state=1, + **{hp_name: default[hp_name] + for hp_name in default + if default[hp_name] is not None}) + + transformer = preprocessor.fit(X_train, Y_train) + transformation, original = transformer.transform( + X_train), original_X_train + self.assertEqual(transformation.shape[0], original.shape[0]) + self.assertEqual(transformation.shape[1], 21) + + def test_default_configuration_regression(self): + transformation, original = _test_preprocessing( + SelectRegressionRates, + dataset='boston', + task='regression', + ) + self.assertEqual(transformation.shape[0], original.shape[0]) + # From 13 to 12 features + self.assertEqual(transformation.shape[1], 12) + self.assertFalse((transformation == 0).all()) + + def test_preprocessing_dtype_regression(self): + # Dense + # np.float32 + X_train, Y_train, X_test, Y_test = get_dataset("boston") + self.assertEqual(X_train.dtype, np.float32) + + dataset_properties = {'target_type': 'regression'} + + configuration_space = SelectRegressionRates.get_hyperparameter_search_space( + dataset_properties + ) + default = configuration_space.get_default_configuration() + preprocessor = SelectRegressionRates(random_state=1, + **{hp_name: default[hp_name] for hp_name in + default}) + preprocessor.fit(X_train, Y_train) + Xt = preprocessor.transform(X_train) + self.assertEqual(Xt.dtype, np.float32) + + # np.float64 + X_train, Y_train, X_test, Y_test = get_dataset("boston") + X_train = X_train.astype(np.float64) + configuration_space = SelectRegressionRates.get_hyperparameter_search_space( + dataset_properties + ) + default = configuration_space.get_default_configuration() + preprocessor = SelectRegressionRates(random_state=1, + **{hp_name: default[hp_name] for hp_name in + default}) + preprocessor.fit(X_train, Y_train) + Xt = preprocessor.transform(X_train) + self.assertEqual(Xt.dtype, np.float64) diff --git a/test/test_pipeline/test_classification.py b/test/test_pipeline/test_classification.py index dbf07563af..6c02ed742f 100644 --- a/test/test_pipeline/test_classification.py +++ b/test/test_pipeline/test_classification.py @@ -659,9 +659,10 @@ def test_select_rates_for_classification(self): does not include regression components""" cs = SimpleClassificationPipeline().get_hyperparameter_search_space() # This check only makes sense if select rates is a valid choice - self.assertIn('select_rates', + self.assertIn('select_rates_classification', cs.get_hyperparameter('feature_preprocessor:__choice__').choices) - choices = cs.get_hyperparameter('feature_preprocessor:select_rates:score_func').choices + choices = cs.get_hyperparameter( + 'feature_preprocessor:select_rates_classification:score_func').choices # Below classification choices should not be valid in regression self.assertNotIn('f_regression', choices) diff --git a/test/test_pipeline/test_regression.py b/test/test_pipeline/test_regression.py index 315edcf62d..2d493e4364 100644 --- a/test/test_pipeline/test_regression.py +++ b/test/test_pipeline/test_regression.py @@ -457,9 +457,10 @@ def test_select_rates_for_regression(self): does not include classification components""" cs = SimpleRegressionPipeline().get_hyperparameter_search_space() # This check only makes sense if select rates is a valid choice - self.assertIn('select_rates', + self.assertIn('select_rates_regression', cs.get_hyperparameter('feature_preprocessor:__choice__').choices) - choices = cs.get_hyperparameter('feature_preprocessor:select_rates:score_func').choices + choices = cs.get_hyperparameter( + 'feature_preprocessor:select_rates_regression:score_func').choices # Below classification choices should not be valid in regression self.assertNotIn('f_classif', choices) From ecb5678d82c75a82c1d235115ca205a6ccfd8a86 Mon Sep 17 00:00:00 2001 From: chico Date: Sun, 23 Aug 2020 12:48:45 +0200 Subject: [PATCH 4/8] Relax regression score in test_regression --- test/test_automl/test_estimators.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/test/test_automl/test_estimators.py b/test/test_automl/test_estimators.py index a67d4ec280..7b7a296490 100644 --- a/test/test_automl/test_estimators.py +++ b/test/test_automl/test_estimators.py @@ -709,7 +709,9 @@ def test_regression(self): self.assertEqual(predictions.shape, (356,)) score = mean_squared_error(Y_test, predictions) # On average np.sqrt(30) away from the target -> ~5.5 on average - self.assertGreaterEqual(score, -30) + # Results with select rates drops avg score to -32.40, on 30 seconds + # constraint. With more time_left_for_this_task this is no longer an issue + self.assertGreaterEqual(score, -34) def test_cv_regression(self): """ @@ -733,7 +735,9 @@ def test_cv_regression(self): self.assertEqual(predictions.shape, (356,)) score = mean_squared_error(Y_test, predictions) # On average np.sqrt(30) away from the target -> ~5.5 on average - self.assertGreaterEqual(score, -30) + # Results with select rates drops avg score to -32.40, on 30 seconds + # constraint. With more time_left_for_this_task this is no longer an issue + self.assertGreaterEqual(score, -34) self._tearDown(tmp) self._tearDown(output) From d4e5d258cb9663f55d9c842847a28a07d3ba7713 Mon Sep 17 00:00:00 2001 From: chico Date: Sun, 23 Aug 2020 16:56:52 +0200 Subject: [PATCH 5/8] relax regression score --- test/test_automl/test_estimators.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_automl/test_estimators.py b/test/test_automl/test_estimators.py index 7b7a296490..c29bc81573 100644 --- a/test/test_automl/test_estimators.py +++ b/test/test_automl/test_estimators.py @@ -709,9 +709,9 @@ def test_regression(self): self.assertEqual(predictions.shape, (356,)) score = mean_squared_error(Y_test, predictions) # On average np.sqrt(30) away from the target -> ~5.5 on average - # Results with select rates drops avg score to -32.40, on 30 seconds + # Results with select rates drops avg score to a range of -32.40 to -37, on 30 seconds # constraint. With more time_left_for_this_task this is no longer an issue - self.assertGreaterEqual(score, -34) + self.assertGreaterEqual(score, -37) def test_cv_regression(self): """ @@ -735,9 +735,9 @@ def test_cv_regression(self): self.assertEqual(predictions.shape, (356,)) score = mean_squared_error(Y_test, predictions) # On average np.sqrt(30) away from the target -> ~5.5 on average - # Results with select rates drops avg score to -32.40, on 30 seconds + # Results with select rates drops avg score to a range of -32.40 to -37, on 30 seconds # constraint. With more time_left_for_this_task this is no longer an issue - self.assertGreaterEqual(score, -34) + self.assertGreaterEqual(score, -37) self._tearDown(tmp) self._tearDown(output) From fe52d8973152c1ef43cebfac67d68a9987d9ba9c Mon Sep 17 00:00:00 2001 From: chico Date: Sun, 23 Aug 2020 18:34:27 +0200 Subject: [PATCH 6/8] mutual info regression degrading performance on non sparse data --- .../feature_preprocessing/select_rates_regression.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/autosklearn/pipeline/components/feature_preprocessing/select_rates_regression.py b/autosklearn/pipeline/components/feature_preprocessing/select_rates_regression.py index 16d6bada58..23577c6056 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/select_rates_regression.py +++ b/autosklearn/pipeline/components/feature_preprocessing/select_rates_regression.py @@ -95,9 +95,9 @@ def get_hyperparameter_search_space(dataset_properties=None): if dataset_properties is not None and 'sparse' in dataset_properties \ and dataset_properties['sparse']: - choices = ['mutual_info_regression'] + choices = ['mutual_info_regression', 'f_regression'] else: - choices = ['f_regression', 'mutual_info_regression'] + choices = ['f_regression'] score_func = CategoricalHyperparameter( name="score_func", From f1c2d8ad2ca3709c84722ddd1cfd884fde464ff4 Mon Sep 17 00:00:00 2001 From: chico Date: Wed, 26 Aug 2020 20:42:50 +0200 Subject: [PATCH 7/8] Incorporated comments from PR --- .../select_rates_classification.py | 15 ++++--- .../select_rates_regression.py | 40 ++++--------------- autosklearn/pipeline/util.py | 16 ++------ .../test_select_rates_regression.py | 3 +- test/test_pipeline/test_classification.py | 14 ------- test/test_pipeline/test_regression.py | 15 ------- 6 files changed, 22 insertions(+), 81 deletions(-) diff --git a/autosklearn/pipeline/components/feature_preprocessing/select_rates_classification.py b/autosklearn/pipeline/components/feature_preprocessing/select_rates_classification.py index 2d8f11619a..2f5acc0d16 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/select_rates_classification.py +++ b/autosklearn/pipeline/components/feature_preprocessing/select_rates_classification.py @@ -22,9 +22,7 @@ def __init__(self, alpha, mode='fpr', self.score_func = sklearn.feature_selection.f_classif elif score_func == "mutual_info_classif": self.score_func = sklearn.feature_selection.mutual_info_classif - # Work Around as SMAC does not handle Not Equal - # Mutual info needs scikit learn default to prevent - # running into p_values problem (no pvalue found) + # mutual info classif constantly crashes without mode percentile self.mode = 'percentile' else: raise ValueError("score_func must be in ('chi2, 'f_classif', 'mutual_info_classif') " @@ -105,8 +103,7 @@ def get_hyperparameter_search_space(dataset_properties=None): alpha = UniformFloatHyperparameter( name="alpha", lower=0.01, upper=0.5, default_value=0.1) - if dataset_properties is not None and 'sparse' in dataset_properties \ - and dataset_properties['sparse']: + if dataset_properties is not None and dataset_properties.get('sparse'): choices = ['chi2', 'mutual_info_classif'] else: choices = ['chi2', 'f_classif', 'mutual_info_classif'] @@ -114,7 +111,7 @@ def get_hyperparameter_search_space(dataset_properties=None): score_func = CategoricalHyperparameter( name="score_func", choices=choices, - default_value="chi2" if 'chi2' in choices else choices[0]) + default_value="chi2") mode = CategoricalHyperparameter('mode', ['fpr', 'fdr', 'fwe'], 'fpr') @@ -123,4 +120,10 @@ def get_hyperparameter_search_space(dataset_properties=None): cs.add_hyperparameter(score_func) cs.add_hyperparameter(mode) + # TODO: Add when smac supports NotEqualConditionyy + # mutual_info_classif constantly crashes if mode is not percentile + # as a WA, fix the mode for this score + #cond = NotEqualsCondition(mode, score_func, 'mutual_info_classif') + #cs.add_condition(cond) + return cs diff --git a/autosklearn/pipeline/components/feature_preprocessing/select_rates_regression.py b/autosklearn/pipeline/components/feature_preprocessing/select_rates_regression.py index 23577c6056..b388f620ff 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/select_rates_regression.py +++ b/autosklearn/pipeline/components/feature_preprocessing/select_rates_regression.py @@ -20,9 +20,7 @@ def __init__(self, alpha, mode='percentile', self.score_func = sklearn.feature_selection.f_regression elif score_func == "mutual_info_regression": self.score_func = sklearn.feature_selection.mutual_info_regression - # Work Around as SMAC does not handle Not Equal - # Mutual info needs scikit learn default to prevent - # running into p_values problem (no pvalue found) + # Mutual info consistently crashes if percentile is not the mode self.mode = 'percentile' else: raise ValueError("score_func must be in ('f_regression, 'mutual_info_regression') " @@ -38,14 +36,6 @@ def fit(self, X, y): self.preprocessor = sklearn.feature_selection.GenericUnivariateSelect( score_func=self.score_func, param=self.alpha, mode=self.mode) - # Because the pipeline guarantees that each feature is positive, - # clip all values below zero to zero - if self.score_func == sklearn.feature_selection.chi2: - if scipy.sparse.issparse(X): - X.data[X.data < 0] = 0.0 - else: - X[X < 0] = 0.0 - self.preprocessor.fit(X, y) return self @@ -70,13 +60,6 @@ def transform(self, X): @staticmethod def get_properties(dataset_properties=None): - data_type = UNSIGNED_DATA - - if dataset_properties is not None: - signed = dataset_properties.get('signed') - if signed is not None: - data_type = SIGNED_DATA if signed is True else UNSIGNED_DATA - return {'shortname': 'SR', 'name': 'Univariate Feature Selection based on rates', 'handles_regression': True, @@ -85,7 +68,7 @@ def get_properties(dataset_properties=None): 'handles_multilabel': False, 'handles_multioutput': False, 'is_deterministic': True, - 'input': (SPARSE, DENSE, data_type), + 'input': (SPARSE, DENSE, UNSIGNED_DATA), 'output': (INPUT,)} @staticmethod @@ -93,8 +76,7 @@ def get_hyperparameter_search_space(dataset_properties=None): alpha = UniformFloatHyperparameter( name="alpha", lower=0.01, upper=0.5, default_value=0.1) - if dataset_properties is not None and 'sparse' in dataset_properties \ - and dataset_properties['sparse']: + if dataset_properties is not None and dataset_properties.get('sparse'): choices = ['mutual_info_regression', 'f_regression'] else: choices = ['f_regression'] @@ -102,7 +84,7 @@ def get_hyperparameter_search_space(dataset_properties=None): score_func = CategoricalHyperparameter( name="score_func", choices=choices, - default_value="f_regression" if 'f_regression' in choices else choices[0]) + default_value="f_regression") mode = CategoricalHyperparameter('mode', ['fpr', 'fdr', 'fwe'], 'fpr') @@ -111,16 +93,10 @@ def get_hyperparameter_search_space(dataset_properties=None): cs.add_hyperparameter(score_func) cs.add_hyperparameter(mode) - # In case of mutual info regression, the mode needs to be percentile - # Which is the scikit learn default, else we run into p_values problem - # SMAC Cannot handle OR, so leave this code here for the future. - # Right now, we will have mode in the config space when we - # have mutual_info, yet it is not needed + # TODO: Add when SMAC supports not equal condition + # Mutual info consistently crashes if percentile is not the mode # if 'mutual_info_regression' in choices: - # cond = NotEqualsCondition(mode, score_func, 'mutual_info_regression') - # cs.add_condition(cond) - # if 'mutual_info_classif' in choices: - # cond = NotEqualsCondition(mode, score_func, 'mutual_info_classif') - # cs.add_condition(cond) + # cond = NotEqualsCondition(mode, score_func, 'mutual_info_regression') + # cs.add_condition(cond) return cs diff --git a/autosklearn/pipeline/util.py b/autosklearn/pipeline/util.py index 6e1f3aaf2d..a2dfaa1a6c 100644 --- a/autosklearn/pipeline/util.py +++ b/autosklearn/pipeline/util.py @@ -169,25 +169,17 @@ def _test_classifier_predict_proba(classifier, dataset='iris', sparse=False, def _test_preprocessing(Preprocessor, dataset='iris', make_sparse=False, - train_size_maximum=150, task=None): + train_size_maximum=150): X_train, Y_train, X_test, Y_test = get_dataset(dataset=dataset, make_sparse=make_sparse, train_size_maximum=train_size_maximum) original_X_train = X_train.copy() - - if task is not None: - dataset_properties = {'target_type': task} - else: - dataset_properties = None - - configuration_space = Preprocessor.get_hyperparameter_search_space(dataset_properties) + configuration_space = Preprocessor.get_hyperparameter_search_space() default = configuration_space.get_default_configuration() - kwargs = {hp_name: default[hp_name] for hp_name in - default if default[hp_name] is not None} - preprocessor = Preprocessor(random_state=np.random.RandomState(1), - **kwargs) + **{hp_name: default[hp_name] for hp_name in + default if default[hp_name] is not None}) transformer = preprocessor.fit(X_train, Y_train) return transformer.transform(X_train), original_X_train diff --git a/test/test_pipeline/components/feature_preprocessing/test_select_rates_regression.py b/test/test_pipeline/components/feature_preprocessing/test_select_rates_regression.py index a8e7659d5c..573bab32ce 100644 --- a/test/test_pipeline/components/feature_preprocessing/test_select_rates_regression.py +++ b/test/test_pipeline/components/feature_preprocessing/test_select_rates_regression.py @@ -22,7 +22,7 @@ def test_default_configuration(self): self.assertEqual(transformation.shape[0], original.shape[0]) self.assertEqual(transformation.shape[1], int(original.shape[1] / 2)) - # Custom preprocessing test to check if clipping to zero works + # Makes sure that the features are reduced, not the number of samples X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits') original_X_train = X_train.copy() ss = sklearn.preprocessing.StandardScaler() @@ -45,7 +45,6 @@ def test_default_configuration_regression(self): transformation, original = _test_preprocessing( SelectRegressionRates, dataset='boston', - task='regression', ) self.assertEqual(transformation.shape[0], original.shape[0]) # From 13 to 12 features diff --git a/test/test_pipeline/test_classification.py b/test/test_pipeline/test_classification.py index 6c02ed742f..212971f0c9 100644 --- a/test/test_pipeline/test_classification.py +++ b/test/test_pipeline/test_classification.py @@ -654,20 +654,6 @@ def test_pipeline_clonability(self): param2 = params_set[name] self.assertEqual(param1, param2) - def test_select_rates_for_classification(self): - """Makes sure that the configuration space of select rates - does not include regression components""" - cs = SimpleClassificationPipeline().get_hyperparameter_search_space() - # This check only makes sense if select rates is a valid choice - self.assertIn('select_rates_classification', - cs.get_hyperparameter('feature_preprocessor:__choice__').choices) - choices = cs.get_hyperparameter( - 'feature_preprocessor:select_rates_classification:score_func').choices - - # Below classification choices should not be valid in regression - self.assertNotIn('f_regression', choices) - self.assertNotIn('mutual_info_regression', choices) - def test_set_params(self): pass diff --git a/test/test_pipeline/test_regression.py b/test/test_pipeline/test_regression.py index 2d493e4364..e6450d2275 100644 --- a/test/test_pipeline/test_regression.py +++ b/test/test_pipeline/test_regression.py @@ -452,21 +452,6 @@ def test_pipeline_clonability(self): param2 = params_set[name] self.assertEqual(param1, param2) - def test_select_rates_for_regression(self): - """Makes sure that the configuration space of select rates - does not include classification components""" - cs = SimpleRegressionPipeline().get_hyperparameter_search_space() - # This check only makes sense if select rates is a valid choice - self.assertIn('select_rates_regression', - cs.get_hyperparameter('feature_preprocessor:__choice__').choices) - choices = cs.get_hyperparameter( - 'feature_preprocessor:select_rates_regression:score_func').choices - - # Below classification choices should not be valid in regression - self.assertNotIn('f_classif', choices) - self.assertNotIn('mutual_info_classif', choices) - self.assertNotIn('chi2', choices) - def test_set_params(self): pass From 0adea8aec9b2b02953449bbb4bab5e005bfeb903 Mon Sep 17 00:00:00 2001 From: chico Date: Fri, 28 Aug 2020 17:16:39 +0200 Subject: [PATCH 8/8] moved from pcs to json --- autosklearn/automl.py | 12 +++++++----- .../select_rates_classification.py | 6 +++--- .../feature_preprocessing/select_rates_regression.py | 11 +++++------ 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 2b4265db95..6616313d59 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -7,7 +7,7 @@ import unittest.mock import warnings -from ConfigSpace.read_and_write import pcs +from ConfigSpace.read_and_write import json as cs_json import numpy as np import numpy.ma as ma import pandas as pd @@ -1008,7 +1008,7 @@ def _create_search_space(self, tmp_dir, backend, datamanager, task_name = 'CreateConfigSpace' self._stopwatch.start_task(task_name) - configspace_path = os.path.join(tmp_dir, 'space.pcs') + configspace_path = os.path.join(tmp_dir, 'space.json') configuration_space = pipeline.get_configuration_space( datamanager.info, include_estimators=include_estimators, @@ -1017,9 +1017,11 @@ def _create_search_space(self, tmp_dir, backend, datamanager, exclude_preprocessors=exclude_preprocessors) configuration_space = self.configuration_space_created_hook( datamanager, configuration_space) - sp_string = pcs.write(configuration_space) - backend.write_txt_file(configspace_path, sp_string, - 'Configuration space') + backend.write_txt_file( + configspace_path, + cs_json.write(configuration_space), + 'Configuration space' + ) self._stopwatch.stop_task(task_name) return configuration_space, configspace_path diff --git a/autosklearn/pipeline/components/feature_preprocessing/select_rates_classification.py b/autosklearn/pipeline/components/feature_preprocessing/select_rates_classification.py index 2f5acc0d16..2700b81229 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/select_rates_classification.py +++ b/autosklearn/pipeline/components/feature_preprocessing/select_rates_classification.py @@ -1,6 +1,7 @@ from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ CategoricalHyperparameter +from ConfigSpace import NotEqualsCondition from autosklearn.pipeline.components.base import \ AutoSklearnPreprocessingAlgorithm @@ -120,10 +121,9 @@ def get_hyperparameter_search_space(dataset_properties=None): cs.add_hyperparameter(score_func) cs.add_hyperparameter(mode) - # TODO: Add when smac supports NotEqualConditionyy # mutual_info_classif constantly crashes if mode is not percentile # as a WA, fix the mode for this score - #cond = NotEqualsCondition(mode, score_func, 'mutual_info_classif') - #cs.add_condition(cond) + cond = NotEqualsCondition(mode, score_func, 'mutual_info_classif') + cs.add_condition(cond) return cs diff --git a/autosklearn/pipeline/components/feature_preprocessing/select_rates_regression.py b/autosklearn/pipeline/components/feature_preprocessing/select_rates_regression.py index b388f620ff..238eaed3c1 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/select_rates_regression.py +++ b/autosklearn/pipeline/components/feature_preprocessing/select_rates_regression.py @@ -1,10 +1,11 @@ from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \ CategoricalHyperparameter +from ConfigSpace import NotEqualsCondition from autosklearn.pipeline.components.base import \ AutoSklearnPreprocessingAlgorithm -from autosklearn.pipeline.constants import SIGNED_DATA, UNSIGNED_DATA, SPARSE, DENSE, INPUT +from autosklearn.pipeline.constants import UNSIGNED_DATA, SPARSE, DENSE, INPUT class SelectRegressionRates(AutoSklearnPreprocessingAlgorithm): @@ -28,7 +29,6 @@ def __init__(self, alpha, mode='percentile', "but is: %s " % (score_func)) def fit(self, X, y): - import scipy.sparse import sklearn.feature_selection self.alpha = float(self.alpha) @@ -93,10 +93,9 @@ def get_hyperparameter_search_space(dataset_properties=None): cs.add_hyperparameter(score_func) cs.add_hyperparameter(mode) - # TODO: Add when SMAC supports not equal condition # Mutual info consistently crashes if percentile is not the mode - # if 'mutual_info_regression' in choices: - # cond = NotEqualsCondition(mode, score_func, 'mutual_info_regression') - # cs.add_condition(cond) + if 'mutual_info_regression' in choices: + cond = NotEqualsCondition(mode, score_func, 'mutual_info_regression') + cs.add_condition(cond) return cs