diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py
index 2b6febb30b..4a251b7115 100644
--- a/autosklearn/estimators.py
+++ b/autosklearn/estimators.py
@@ -408,7 +408,7 @@ def fit(self, X, y,
         """Fit *auto-sklearn* to given training set (X, y).
 
         Fit both optimizes the machine learning models and builds an ensemble
-        out of them. To disable ensembling, set ``ensemble_size==1``.
+        out of them. To disable ensembling, set ``ensemble_size==0``.
 
         Parameters
         ----------
@@ -512,7 +512,7 @@ def fit(self, X, y,
         """Fit *Auto-sklearn* to given training set (X, y).
 
         Fit both optimizes the machine learning models and builds an ensemble
-        out of them. To disable ensembling, set ``ensemble_size==1``.
+        out of them. To disable ensembling, set ``ensemble_size==0``.
 
         Parameters
         ----------
diff --git a/doc/Makefile b/doc/Makefile
index f4b7bf3ce2..9355370597 100644
--- a/doc/Makefile
+++ b/doc/Makefile
@@ -51,6 +51,8 @@ help:
 clean:
 	rm -rf $(BUILDDIR)/*
 	rm -rf generated
+	rm -rf examples/
+	rm -rf gen_modules/
 
 html:
 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
diff --git a/doc/api.rst b/doc/api.rst
index 1ade6bcd8a..34107e78a7 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -103,3 +103,5 @@ Extension Interfaces
 
 .. autoclass:: autosklearn.pipeline.components.base.AutoSklearnPreprocessingAlgorithm
     :members:
+
+
diff --git a/doc/conf.py b/doc/conf.py
index 6b93d98691..0e650b4f69 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -41,12 +41,37 @@
 extensions = ['sphinx.ext.autodoc', 'sphinx.ext.autosummary', 'sphinx.ext.doctest',
               'sphinx.ext.coverage', 'sphinx.ext.mathjax', 'sphinx.ext.viewcode',
+              'sphinx_gallery.gen_gallery',
               'sphinx.ext.autosectionlabel',
+              # sphinx.ext.autosectionlabel raises duplicate label warnings
+              # because the same section headers are used multiple times throughout
+              # the documentation.
               'numpydoc']
+
+from sphinx_gallery.sorting import ExplicitOrder, FileNameSortKey
+
 # Configure the extensions
 numpydoc_show_class_members = False
 autosummary_generate = True
 
+# Prefix each section label with the name of the document it is in, in order to
+# avoid ambiguity when the same section label is used in different documents.
+autosectionlabel_prefix_document = True
+
+# Sphinx-gallery configuration.
+sphinx_gallery_conf = {
+    # path to the examples
+    'examples_dirs': '../examples',
+    # path where the generated gallery examples are saved
+    'gallery_dirs': 'examples',
+    # TODO: fix back/forward references for the examples.
+    #'doc_module': ('autosklearn'),
+    #'reference_url': {
+    #    'autosklearn': None
+    #},
+    #'backreferences_dir': 'gen_modules/backreferences'
+}
+
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
@@ -134,6 +159,7 @@
         ('Releases', 'releases'),
         ('Installation', 'installation'),
         ('Manual', 'manual'),
+        ('Examples', 'examples/index'),
         ('API', 'api'),
         ('Extending', 'extending'),
     ],
diff --git a/doc/index.rst b/doc/index.rst
index bb4826625d..4af101979b 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -47,7 +47,7 @@ Manual
 
 * :ref:`installation`
 * :ref:`manual`
-* :ref:`API`
+* :ref:`api`
 * :ref:`extending`
 
diff --git a/doc/manual.rst b/doc/manual.rst
index c9f581def7..256452077a 100644
--- a/doc/manual.rst
+++ b/doc/manual.rst
@@ -15,13 +15,15 @@ Examples
 *auto-sklearn* comes with the following examples which demonstrate several
 aspects of its usage:
 
-* `Holdout `_
-* `Cross-validation `_
-* `Parallel usage `_
-* `Sequential usage `_
-* `Regression `_
-* `Continuous and categorical data `_
-* `Using custom metrics `_
+* `Holdout `_
+* `Cross-validation `_
+* `Parallel usage `_
+* `Sequential usage `_
+* `Regression `_
+* `Continuous and categorical data `_
+* `Using custom metrics `_
+* `Random search `_
+* `EIPS `_
 
 
 Time and memory limits
diff --git a/examples/README.txt b/examples/README.txt
new file mode 100644
index 0000000000..2878fe2917
--- /dev/null
+++ b/examples/README.txt
@@ -0,0 +1,9 @@
+:orphan:
+
+.. _examples:
+
+========
+Examples
+========
+
+General introductory examples for *auto-sklearn* can be found here.
diff --git a/example/example_crossvalidation.py b/examples/example_crossvalidation.py
similarity index 79%
rename from example/example_crossvalidation.py
rename to examples/example_crossvalidation.py
index 06299b48db..85530b591b 100644
--- a/example/example_crossvalidation.py
+++ b/examples/example_crossvalidation.py
@@ -1,4 +1,18 @@
 # -*- encoding: utf-8 -*-
+
+
+"""
+================
+Cross-Validation
+================
+
+In *auto-sklearn* it is possible to use different resampling strategies
+by specifying the arguments ``resampling_strategy`` and
+``resampling_strategy_arguments``. The following example shows how to use
+cross-validation and how to set the folds when instantiating
+``AutoSklearnClassifier``.
+"""
+
 import sklearn.model_selection
 import sklearn.datasets
 import sklearn.metrics
diff --git a/example/example_eips.py b/examples/example_eips.py
similarity index 98%
rename from example/example_eips.py
rename to examples/example_eips.py
index 3e0019c4c2..5223b1db04 100644
--- a/example/example_eips.py
+++ b/examples/example_eips.py
@@ -1,3 +1,11 @@
+"""
+====
+EIPS
+====
+
+This example shows how to use EIPS (expected improvement per second) as the acquisition function.
+"""
+
 import sklearn.model_selection
 import sklearn.datasets
 import sklearn.metrics
diff --git a/example/example_feature_types.py b/examples/example_feature_types.py
similarity index 83%
rename from example/example_feature_types.py
rename to examples/example_feature_types.py
index 53b0b70b07..c33f77c195 100644
--- a/example/example_feature_types.py
+++ b/examples/example_feature_types.py
@@ -1,4 +1,14 @@
 # -*- encoding: utf-8 -*-
+"""
+=============
+Feature Types
+=============
+
+In *auto-sklearn* it is possible to specify the feature types of a dataset when
+calling the method :meth:`fit() ` via the argument ``feat_type``.
+The following example demonstrates how this can be done.
+""" + import sklearn.model_selection import sklearn.datasets import sklearn.metrics diff --git a/example/example_holdout.py b/examples/example_holdout.py similarity index 62% rename from example/example_holdout.py rename to examples/example_holdout.py index a17707aac1..fe1ff1c7a7 100644 --- a/example/example_holdout.py +++ b/examples/example_holdout.py @@ -1,3 +1,15 @@ +""" +======= +Holdout +======= + +In *auto-sklearn* it is possible to use different resampling strategies +by specifying the arguments ``resampling_strategy`` and +``resampling_strategy_arguments``. The following example shows how to use the +holdout method as well as set the train-test split ratio when instantiating +``AutoSklearnClassifier``. +""" + import sklearn.model_selection import sklearn.datasets import sklearn.metrics @@ -16,6 +28,11 @@ def main(): tmp_folder='/tmp/autosklearn_holdout_example_tmp', output_folder='/tmp/autosklearn_holdout_example_out', disable_evaluator_output=False, + # 'holdout' with 'train_size'=0.67 is the default argument setting + # for AutoSklearnClassifier. It is explicitly specified in this example + # for demonstrational purpose. + resampling_strategy='holdout', + resampling_strategy_arguments={'train_size': 0.67} ) automl.fit(X_train, y_train, dataset_name='digits') diff --git a/example/example_metrics.py b/examples/example_metrics.py similarity index 58% rename from example/example_metrics.py rename to examples/example_metrics.py index dd306e6c63..ebd86ec0e0 100644 --- a/example/example_metrics.py +++ b/examples/example_metrics.py @@ -1,4 +1,16 @@ # -*- encoding: utf-8 -*- +""" +======= +Metrics +======= + +*Auto-sklearn* supports various built-in metrics, which can be found in the +:ref:`metrics section in the API `. However, it is also +possible to define your own metric and use it to fit and evaluate your model. +The following examples show how to use built-in and self-defined metrics for a +classification problem. +""" + import numpy as np import sklearn.model_selection @@ -11,15 +23,24 @@ def accuracy(solution, prediction): - # function defining accuracy + # custom function defining accuracy return np.mean(solution == prediction) +def error(solution, prediction): + # custom function defining error + return np.mean(solution != prediction) + def accuracy_wk(solution, prediction, dummy): - # function defining accuracy and accepting an additional argument + # custom function defining accuracy and accepting an additional argument assert dummy is None return np.mean(solution == prediction) +def error_wk(solution, prediction, dummy): + # custom function defining error and accepting an additional argument + assert dummy is None + return np.mean(solution != prediction) + def main(): @@ -72,6 +93,28 @@ def main(): format(sklearn.metrics.accuracy_score(y_test, predictions), cls._automl._metric.name)) + print("#"*80) + print("Use self defined error metric") + error_rate = autosklearn.metrics.make_scorer( + name='error', + score_func=error, + optimum=0, + greater_is_better=False, + needs_proba=False, + needs_threshold=False + ) + cls = autosklearn.classification.AutoSklearnClassifier( + time_left_for_this_task=60, + per_run_time_limit=30, + seed=1 + ) + cls.fit(X_train, y_train, metric=error_rate) + + cls.predictions = cls.predict(X_test) + print("Error rate {:g} using {:s}". 
+          format(error_rate(y_test, predictions),
+                 cls._automl._metric.name))
+
     # Third example: Use own accuracy metric with additional argument
     print("#"*80)
     print("Use self defined accuracy with additional argument")
@@ -99,6 +142,31 @@ def main():
         )
     )
 
+    print("#"*80)
+    print("Use self defined error with additional argument")
+    error_rate = autosklearn.metrics.make_scorer(
+        name="error_add",
+        score_func=error_wk,
+        optimum=0,
+        greater_is_better=False,
+        needs_proba=False,
+        needs_threshold=False,
+        dummy=None,
+    )
+    cls = autosklearn.classification.AutoSklearnClassifier(
+        time_left_for_this_task=60,
+        per_run_time_limit=30,
+        seed=1,
+    )
+    cls.fit(X_train, y_train, metric=error_rate)
+
+    predictions = cls.predict(X_test)
+    print(
+        "Error rate {:g} using {:s}".format(
+            error_rate(y_test, predictions),
+            cls._automl._metric.name
+        )
+    )
 
 if __name__ == "__main__":
     main()
diff --git a/example/example_parallel.py b/examples/example_parallel.py
similarity index 87%
rename from example/example_parallel.py
rename to examples/example_parallel.py
index 4aacbaf0fe..f5572ab97d 100644
--- a/example/example_parallel.py
+++ b/examples/example_parallel.py
@@ -1,4 +1,18 @@
 # -*- encoding: utf-8 -*-
+"""
+==============
+Parallel Usage
+==============
+
+*Auto-sklearn* uses *SMAC* to automatically optimize the hyperparameters of
+the trained models. A variant of *SMAC*, called *pSMAC* (parallel SMAC),
+makes it possible to run several instances of *auto-sklearn* in parallel
+across several computational resources (detailed information about
+*pSMAC* can be found `here `_).
+This example shows the steps necessary to configure *auto-sklearn* in
+parallel mode.
+"""
+
 import multiprocessing
 import shutil
@@ -62,8 +76,8 @@ def spawn_classifier(seed, dataset_name):
     return spawn_classifier
 
 
-if __name__ == '__main__':
-
+def main():
+
     X, y = sklearn.datasets.load_digits(return_X_y=True)
     X_train, X_test, y_train, y_test = \
         sklearn.model_selection.train_test_split(X, y, random_state=1)
@@ -106,3 +120,7 @@ def spawn_classifier(seed, dataset_name):
     predictions = automl.predict(X_test)
     print(automl.show_models())
     print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/example/example_random_search.py b/examples/example_random_search.py
similarity index 87%
rename from example/example_random_search.py
rename to examples/example_random_search.py
index 1c62a190dd..5759d9af0e 100644
--- a/example/example_random_search.py
+++ b/examples/example_random_search.py
@@ -1,3 +1,14 @@
+"""
+=============
+Random Search
+=============
+
+A crucial feature of *auto-sklearn* is automatically optimizing the hyperparameters
+through SMAC, introduced `here `_.
+Additionally, it is possible to use `random search `_
+instead of SMAC, as demonstrated in the example below.
+"""
+
 import sklearn.model_selection
 import sklearn.datasets
 import sklearn.metrics
@@ -17,9 +28,7 @@ def get_roar_object_callback(
         runhistory,
         run_id,
 ):
-    """Random online adaptive racing.
-
-    http://ml.informatik.uni-freiburg.de/papers/11-LION5-SMAC.pdf"""
+    """Random online adaptive racing."""
     scenario_dict['input_psmac_dirs'] = backend.get_smac_output_glob()
     scenario = Scenario(scenario_dict)
     return ROAR(
@@ -40,9 +49,7 @@ def get_random_search_object_callback(
         runhistory,
         run_id,
 ):
-    """Random search.
-
-    http://www.jmlr.org/papers/v13/bergstra12a.html"""
+    """Random search."""
     scenario_dict['input_psmac_dirs'] = backend.get_smac_output_glob()
     scenario_dict['minR'] = len(scenario_dict['instances'])
     scenario_dict['initial_incumbent'] = 'RANDOM'
diff --git a/example/example_regression.py b/examples/example_regression.py
similarity index 88%
rename from example/example_regression.py
rename to examples/example_regression.py
index 9420d88f83..79b1a18f08 100644
--- a/example/example_regression.py
+++ b/examples/example_regression.py
@@ -1,11 +1,18 @@
 # -*- encoding: utf-8 -*-
+"""
+==========
+Regression
+==========
+
+The following example shows how to fit a simple regression model with
+*auto-sklearn*.
+"""
 import sklearn.model_selection
 import sklearn.datasets
 import sklearn.metrics
 
 import autosklearn.regression
 
-
 def main():
     X, y = sklearn.datasets.load_boston(return_X_y=True)
     feature_types = (['numerical'] * 3) + ['categorical'] + (['numerical'] * 9)
diff --git a/example/example_sequential.py b/examples/example_sequential.py
similarity index 80%
rename from example/example_sequential.py
rename to examples/example_sequential.py
index 019ad3be9c..06820e7ebe 100644
--- a/example/example_sequential.py
+++ b/examples/example_sequential.py
@@ -1,3 +1,14 @@
+"""
+================
+Sequential Usage
+================
+
+By default, *auto-sklearn* fits the machine learning models and builds their
+ensembles in parallel. However, it is also possible to run the two processes
+sequentially. The example below shows how to first fit the models and build the
+ensembles afterwards.
+"""
+
 import sklearn.model_selection
 import sklearn.datasets
 import sklearn.metrics