From 5189c91760d2fc626a36dab26696f469c5435399 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Wed, 10 May 2017 11:26:33 +0200 Subject: [PATCH 01/12] Document #217, vanilla auto-sklearn --- doc/extending.rst | 2 +- doc/manual.rst | 35 ++++++++++++++++++++++++++--------- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/doc/extending.rst b/doc/extending.rst index d7595614c4..b1dbe5fa2b 100644 --- a/doc/extending.rst +++ b/doc/extending.rst @@ -8,7 +8,7 @@ Extending auto-sklearn auto-sklearn can be easily extended with new classification, regression and feature preprocessing methods. In order to do so, a user has to implement a -wrapper class and make it known to auto-sklearn. This manual will walk you +wrapper class and register it to auto-sklearn. This manual will walk you through the process. diff --git a/doc/manual.rst b/doc/manual.rst index 57cf43da45..60b5453de5 100644 --- a/doc/manual.rst +++ b/doc/manual.rst @@ -2,14 +2,15 @@ .. _manual: +====== Manual -~~~~~~ +====== This manual shows how to use several aspects of auto-sklearn. It either references the examples where possible or explains certain configurations. Restrict Searchspace -********************* +==================== Instead of using all available estimators, it is possible to restrict *auto-sklearn*'s searchspace. The following shows an example of how to exclude @@ -17,9 +18,9 @@ all preprocessing methods and restrict the configuration space to only random forests. 
>>> import autosklearn.classification ->>> automl = autosklearn.classification.AutoSklearnClassifier(include_estimators=["random_forest", ], ->>> exclude_estimators=None, include_preprocessors=["no_preprocessing", ], ->>> exclude_preprocessors=None) +>>> automl = autosklearn.classification.AutoSklearnClassifier( +>>> include_estimators=["random_forest", ], exclude_estimators=None, +>>> include_preprocessors=["no_preprocessing", ], exclude_preprocessors=None) >>> cls.fit(X_train, y_train) >>> predictions = cls.predict(X_test, y_test) @@ -32,12 +33,12 @@ For a full list please have a look at the source code (in `autosklearn/pipeline/ * `Preprocessors `_ Resampling strategies -********************* +===================== Examples for using holdout and cross-validation can be found in `auto-sklearn/examples/ `_ Parallel computation -******************** +==================== *auto-sklearn* supports parallel execution by data sharing on a shared file system. In this mode, the SMAC algorithm shares the training data for it's @@ -60,9 +61,25 @@ from `pypi` as a binary wheel (`see here `_ -from scikit-learn. \ No newline at end of file +from scikit-learn. + +Vanilla auto-sklearn +==================== + +In order to obtain *vanilla auto-sklearn* as used in `Efficient and Robust Automated Machine Learning +`_ +set ``ensemble_size=1`` and ``initial_configurations_via_metalearning=0``: + +>>> import autosklearn.classification +>>> automl = autosklearn.classification.AutoSklearnClassifier( +>>> ensemble_size=1, initial_configurations_via_metalearning=0) + +An ensemble of size one will result in always choosing the current best model +according to its performance on the validation set. Setting the initial +configurations found by meta-learning to zero makes *auto-sklearn* use the +regular SMAC algorithm for suggesting new hyperparameter configurations. 
\ No newline at end of file From 26711a4c908e1749711a61aee53c3f5d6c425a02 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Wed, 10 May 2017 12:45:53 +0200 Subject: [PATCH 02/12] FIX #174, document fit_ensemble() --- autosklearn/automl.py | 2 +- autosklearn/estimators.py | 43 +++++++++++++++++++++++++++++++++++---- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index fc13d23c0e..2528e27bf9 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -554,7 +554,7 @@ def predict(self, X, batch_size=None, n_jobs=1): predictions = self.ensemble_.predict(all_predictions) return predictions - def fit_ensemble(self, y, task=None, metric=None, precision='32', + def fit_ensemble(self, task=None, metric=None, precision='32', dataset_name=None, ensemble_nbest=None, ensemble_size=None): if self._resampling_strategy in ['partial-cv', 'partial-cv-iterative-fit']: diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index c3fa6b7abf..4921d0e470 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -50,7 +50,7 @@ def refit(self, X, y): """ return self._automl.refit(X, y) - def fit_ensemble(self, y, task=None, metric=None, precision='32', + def fit_ensemble(self, task=None, metric=None, precision='32', dataset_name=None, ensemble_nbest=None, ensemble_size=None): """Build the ensemble. 
@@ -62,7 +62,7 @@ def fit_ensemble(self, y, task=None, metric=None, precision='32', self """ - return self._automl.fit_ensemble(y, task, metric, precision, + return self._automl.fit_ensemble(task, metric, precision, dataset_name, ensemble_nbest, ensemble_size) @@ -302,12 +302,47 @@ def fit(self, *args, **kwargs): self._automl = self.build_automl() super(AutoSklearnEstimator, self).fit(*args, **kwargs) - def fit_ensemble(self, y, task=None, metric=None, precision='32', + def fit_ensemble(self, task=None, metric=None, precision='32', dataset_name=None, ensemble_nbest=None, ensemble_size=None): + """Fit an ensemble to models trained during an optimization process. + + All parameters are ``None`` by default. If no other value is given, + the default values which were set in a call to ``fit()`` are used. + + Parameters + ---------- + task : int + A constant from the module ``autosklearn.constants``. Determines + the task type (binary classification, multiclass classification, + multilabel classification or regression). + + metric : callable, optional (default='acc_metric') + An instance of ``autosklearn.metrics.Scorer``. + + precision : str + Numeric precision used when loading ensemble data. Can be either + ``'16'``, ``'32'`` or ``'64'``. + + dataset_name : str + Name of the current data set. + + ensemble_nbest : int + Determines how many models should be considered from the ensemble + building. This is inspired by a concept called library pruning + introduced in `Getting Most out of Ensemble Selection`. + + ensemble_size : int + Size of the ensemble built by `Ensemble Selection`. 
+ + Returns + ------- + self + + """ if self._automl is None: self._automl = self.build_automl() - return self._automl.fit_ensemble(y, task, metric, precision, + return self._automl.fit_ensemble(task, metric, precision, dataset_name, ensemble_nbest, ensemble_size) From 17cf2d7d3181ef2c5fc516581174fb1cc116ccde Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Wed, 10 May 2017 12:57:32 +0200 Subject: [PATCH 03/12] FIX #70, document how to turn off preprocessing --- doc/manual.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/doc/manual.rst b/doc/manual.rst index 60b5453de5..4c13927830 100644 --- a/doc/manual.rst +++ b/doc/manual.rst @@ -32,6 +32,18 @@ For a full list please have a look at the source code (in `autosklearn/pipeline/ * `Regressors `_ * `Preprocessors `_ +Turning off preprocessing +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Preprocessing in *auto-sklearn* is divided into data preprocessing and +feature preprocessing. Data preprocessing includes One-Hot encoding of +categorical features, imputation of missing values and the normalization of +features or samples. These steps currently cannot be turned off. Feature +preprocessing is a single transformer which implements for example feature +selection or transformation of features into a different space (i.e. PCA). +This can be turned off by setting +``include_preprocessors=["no_preprocessing"]`` as shown in the example above. 
+ Resampling strategies ===================== From 2e7bbe9f361cd3860c918637215c8aa2fcd40cac Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Wed, 10 May 2017 13:07:29 +0200 Subject: [PATCH 04/12] Update documentation on categoricals in fit() for #121 --- autosklearn/estimators.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index 4921d0e470..65e3cd100b 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -378,7 +378,10 @@ def fit(self, X, y, feat_type : list, optional (default=None) List of str of `len(X.shape[1])` describing the attribute type. Possible types are `Categorical` and `Numerical`. `Categorical` - attributes will be automatically One-Hot encoded. + attributes will be automatically One-Hot encoded. The values + used for a categorical attribute must be integers, obtained for + example by `sklearn.preprocessing.LabelEncoder + `_. dataset_name : str, optional (default=None) Create nicer output. 
If None, a string will be determined by the From 9da64e43665e3c402a6fa04598f8c6096e461f50 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Wed, 10 May 2017 13:57:11 +0200 Subject: [PATCH 05/12] FIX issue with y (target) in ensemble building --- autosklearn/automl.py | 2 +- autosklearn/estimators.py | 11 +++++++---- test/test_automl/test_estimators.py | 3 +-- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 2528e27bf9..fc13d23c0e 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -554,7 +554,7 @@ def predict(self, X, batch_size=None, n_jobs=1): predictions = self.ensemble_.predict(all_predictions) return predictions - def fit_ensemble(self, task=None, metric=None, precision='32', + def fit_ensemble(self, y, task=None, metric=None, precision='32', dataset_name=None, ensemble_nbest=None, ensemble_size=None): if self._resampling_strategy in ['partial-cv', 'partial-cv-iterative-fit']: diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index 65e3cd100b..6c6b1a34e2 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -50,7 +50,7 @@ def refit(self, X, y): """ return self._automl.refit(X, y) - def fit_ensemble(self, task=None, metric=None, precision='32', + def fit_ensemble(self, y, task=None, metric=None, precision='32', dataset_name=None, ensemble_nbest=None, ensemble_size=None): """Build the ensemble. 
@@ -62,7 +62,7 @@ def fit_ensemble(self, task=None, metric=None, precision='32', self """ - return self._automl.fit_ensemble(task, metric, precision, + return self._automl.fit_ensemble(y, task, metric, precision, dataset_name, ensemble_nbest, ensemble_size) @@ -302,7 +302,7 @@ def fit(self, *args, **kwargs): self._automl = self.build_automl() super(AutoSklearnEstimator, self).fit(*args, **kwargs) - def fit_ensemble(self, task=None, metric=None, precision='32', + def fit_ensemble(self, y, task=None, metric=None, precision='32', dataset_name=None, ensemble_nbest=None, ensemble_size=None): """Fit an ensemble to models trained during an optimization process. @@ -312,6 +312,9 @@ def fit_ensemble(self, task=None, metric=None, precision='32', Parameters ---------- + y : array-like + Target values. + task : int A constant from the module ``autosklearn.constants``. Determines the task type (binary classification, multiclass classification, @@ -342,7 +345,7 @@ def fit_ensemble(self, task=None, metric=None, precision='32', """ if self._automl is None: self._automl = self.build_automl() - return self._automl.fit_ensemble(task, metric, precision, + return self._automl.fit_ensemble(y, task, metric, precision, dataset_name, ensemble_nbest, ensemble_size) diff --git a/test/test_automl/test_estimators.py b/test/test_automl/test_estimators.py index 4364b3bd34..4dfada97a7 100644 --- a/test/test_automl/test_estimators.py +++ b/test/test_automl/test_estimators.py @@ -141,8 +141,7 @@ def test_fit_pSMAC(self): seed=2, initial_configurations_via_metalearning=0, ensemble_size=0) - automl.fit_ensemble(Y_train, - task=MULTICLASS_CLASSIFICATION, + automl.fit_ensemble(Y_train, task=MULTICLASS_CLASSIFICATION, metric=accuracy, precision='32', dataset_name='iris', From c571a57e41224dcfaeff493dc8909a56912c7eb7 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Wed, 10 May 2017 14:35:26 +0200 Subject: [PATCH 06/12] Put installation and contributors on own pages, add release notes --- doc/conf.py 
| 2 ++ doc/index.rst | 26 +------------------- doc/installation.rst | 28 +++++++++++++++++++++ doc/manual.rst | 12 +++++++++ doc/releases.rst | 58 ++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 101 insertions(+), 25 deletions(-) create mode 100644 doc/installation.rst create mode 100644 doc/releases.rst diff --git a/doc/conf.py b/doc/conf.py index afd7ed3c39..db6ee2c0fe 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -131,6 +131,8 @@ # be in the form [(name, page), ..] 'navbar_links': [ ('Start', 'index'), + ('Releases', 'releases'), + ('Installation', 'installation'), ('API', 'api'), ('Extending', 'extending'), ('Manual', 'manual'), diff --git a/doc/index.rst b/doc/index.rst index 9043858843..4ac66699e9 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -44,35 +44,13 @@ Example This will run for one hour should result in an accuracy above 0.98. -Installation -************ -**Prerequisities**: *auto-sklearn* is written in python and was developed -with Ubuntu. It should run on other Linux distributions, but won't work on a MAC -or on a windows PC. We aim to always support the two latests python versions, -which are 3.4 and 3.5 at the moment. It is built around scikit-learn 0.17.1 and -needs a compiler for C++ 11. - -Please install all dependencies manually with: - -.. code:: bash - - curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip install - -Then install *auto-sklearn* - -.. code:: bash - - pip install auto-sklearn - -We recommend installing *auto-sklearn* into a `virtual environment -`_. - Manual ****** * :ref:`API` * :ref:`manual` * :ref:`extending` +* :ref:`installation` License @@ -140,5 +118,3 @@ Contributors * Frank Hutter * Diego Kobylkin * Marius Lindauer - - diff --git a/doc/installation.rst b/doc/installation.rst new file mode 100644 index 0000000000..7c8fdce8a9 --- /dev/null +++ b/doc/installation.rst @@ -0,0 +1,28 @@ +:orphan: + +.. 
_installation: + +============ +Installation +============ + +**Prerequisities**: *auto-sklearn* is written in python and was developed +with Ubuntu. It should run on other Linux distributions, but won't work on a MAC +or on a windows PC. We aim to always support the two latests python versions, +which are 3.4 and 3.5 at the moment. It is built around scikit-learn 0.17.1 and +needs a compiler for C++ 11. + +Please install all dependencies manually with: + +.. code:: bash + + curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip install + +Then install *auto-sklearn* + +.. code:: bash + + pip install auto-sklearn + +We recommend installing *auto-sklearn* into a `virtual environment +`_. \ No newline at end of file diff --git a/doc/manual.rst b/doc/manual.rst index 4c13927830..da79c9834b 100644 --- a/doc/manual.rst +++ b/doc/manual.rst @@ -9,6 +9,18 @@ Manual This manual shows how to use several aspects of auto-sklearn. It either references the examples where possible or explains certain configurations. +Examples +======== + +*auto-sklearn* comes with the following examples which demonstrate several +aspects of its usage: + +* `Holdout `_ +* `Cross-validation `_ +* `Parallel usage `_ +* `Sequential usage `_ +* `Regression `_ + Restrict Searchspace ==================== diff --git a/doc/releases.rst b/doc/releases.rst new file mode 100644 index 0000000000..8689a38ae6 --- /dev/null +++ b/doc/releases.rst @@ -0,0 +1,58 @@ +:orphan: + +.. _releases: + +.. + The following command allows to retrieve all commiters since a specified + commit. From http://stackoverflow.com/questions/6482436/list-of-authors-in-git-since-a-given-commit + git log 2e29eba.. 
--format="%aN <%aE>" --reverse | perl -e 'my %dedupe; while () { print unless $dedupe{$_}++}' + +======== +Releases +======== + +Version 0.2.0 +============= + +Major changes +~~~~~~~~~~~~~ + +* **auto-sklearn supports custom metrics and all metrics included in + scikit-learn**. Different metrics can now be passed to the ``fit()``-method + estimator objects, for example + ``AutoSklearnClassifier.fit(metric='roc_auc')``. +* Upgrade to scikit-learn 0.18.1. +* Drop XGBoost as the latest release (0.6a2) does not work when spawned by + the pyninsher. + +Contributors +~~~~~~~~~~~~ + +* Matthias Feurer +* Katharina Eggensperger +* Laurent Sorber + +Version 0.1.x +============= + +There are no release notes for auto-sklearn prior to version 0.2.0. + +Contributors +~~~~~~~~~~~~ + +* Matthias Feurer +* Katharina Eggensperger +* Aaron Klein +* Jost Tobias Springenberg +* Anatolii Domashnev +* Stefan Falkner +* Alexander Sapronov +* Manuel Blum +* Diego Kobylkin +* Jaidev Deshpande +* Jongheon Jeong +* Hector Mendoza +* Timothy J Laurent +* Marius Lindauer +* _329_ +* Iver Jordal From b80fafc404f14ac2e9289c26038182209ea50e62 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Wed, 10 May 2017 15:03:20 +0200 Subject: [PATCH 07/12] Resort pages in index, dedup contributors --- doc/conf.py | 2 +- doc/index.rst | 19 +------------------ 2 files changed, 2 insertions(+), 19 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index db6ee2c0fe..6b93d98691 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -133,9 +133,9 @@ ('Start', 'index'), ('Releases', 'releases'), ('Installation', 'installation'), + ('Manual', 'manual'), ('API', 'api'), ('Extending', 'extending'), - ('Manual', 'manual'), ], # Render the next and previous page links in navbar. (Default: true) diff --git a/doc/index.rst b/doc/index.rst index 4ac66699e9..72c125b66f 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -47,8 +47,8 @@ This will run for one hour should result in an accuracy above 0.98. 
Manual ****** -* :ref:`API` * :ref:`manual` +* :ref:`API` * :ref:`extending` * :ref:`installation` @@ -101,20 +101,3 @@ which is marked with `Needs contributer`. When developing new features, please create a new branch from the development branch. Prior to submitting a pull request, make sure that all tests are still passing. - -Contributors -************ - -* Matthias Feurer -* Katharina Eggensperger -* Jost Tobias Springenberg -* Aaron Klein -* Anatolii Domashnev -* Alexander Sapronov -* Stefan Falkner -* Manuel Blum -* Hector Mendoza -* Farooq Ahmed Zuberi -* Frank Hutter -* Diego Kobylkin -* Marius Lindauer From 944192594dea35f464d0deb5e8710ae1c7f5aff8 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Wed, 10 May 2017 15:04:06 +0200 Subject: [PATCH 08/12] add AnaConda installation tips --- doc/installation.rst | 44 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/doc/installation.rst b/doc/installation.rst index 7c8fdce8a9..abb3db165f 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -6,11 +6,20 @@ Installation ============ -**Prerequisities**: *auto-sklearn* is written in python and was developed -with Ubuntu. It should run on other Linux distributions, but won't work on a MAC -or on a windows PC. We aim to always support the two latests python versions, -which are 3.4 and 3.5 at the moment. It is built around scikit-learn 0.17.1 and -needs a compiler for C++ 11. +System requirements +=================== + +auto-sklearn has the following system requirements: + +* Linux operating system (for example Ubuntu), +* Python (>=3.4). +* C++ compiler (which supports C++11) and SWIG + +For an explanation of missing Microsoft Windows and MAC OSX support please +check the Section `Windows/OSX compabilities`_. 
+ +Python requirements +=================== Please install all dependencies manually with: @@ -25,4 +34,27 @@ Then install *auto-sklearn* pip install auto-sklearn We recommend installing *auto-sklearn* into a `virtual environment -`_. \ No newline at end of file +`_ or an `Anaconda +environment `_. + +Anaconda installation +===================== + +Anaconda does not ship *auto-sklearn*, and there are no conda packages for +*auto-sklearn*. Thus, it is easiest to install *auto-sklearn* as detailed in +the Section `Python requirements`_. + +A common installation problem under recent Linux distribution is the +incompability of the compiler version used to compile the Python binary +shipped by AnaConda and the compiler installed by the distribution. This can +be solved by istalling the *gcc* compiler shipped with AnaConda (as well as +*swig*): + +.. code:: bash + + conda install gcc swig + + +Windows/OSX compabilities +========================= + From 0f9d3e7299a04ebed211b0596bc1fa2063480f4e Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Wed, 10 May 2017 15:39:34 +0200 Subject: [PATCH 09/12] osx and windows installation notes --- doc/installation.rst | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/doc/installation.rst b/doc/installation.rst index abb3db165f..35b254e503 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -27,7 +27,7 @@ Please install all dependencies manually with: curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip install -Then install *auto-sklearn* +Then install *auto-sklearn*: .. code:: bash @@ -58,3 +58,33 @@ be solved by istalling the *gcc* compiler shipped with AnaConda (as well as Windows/OSX compabilities ========================= +Windows +~~~~~~~ + +*auto-sklearn* relies heavily on the Python module ``resource``. ``resource`` +is part of Python's `Unix Specific Services `_ +and not available on a Windows machine. 
Therefore, it is not possible to run +*auto-sklearn* on a Windows machine. + +Possible solutions (not tested): + +* Windows 10 bash shell +* virtual machine +* docker image + +Mac OSX +~~~~~~~ + +Auto-sklearn is known to work on OSX systems. Nevertheless, there are two +issues holding us back from actively supporting OSX: + +* The ``resource`` module cannot enforce a memory limit on a Python process + (see `SMAC3/issues/115 `_). +* OSX machines on `travis-ci `_ take more than 30 + minutes to spawn. This makes it impossible for us to run unit tests for + *auto-sklearn* and its requirements. + +Possible solutions (not tested): + +* virtual machine +* docker image From dcf87bcf572ddc95dca690c5f87bb6070e32e8d7 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Wed, 10 May 2017 15:40:21 +0200 Subject: [PATCH 10/12] Fix typo --- doc/installation.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/installation.rst b/doc/installation.rst index 35b254e503..f22ff4531e 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -55,8 +55,8 @@ be solved by istalling the *gcc* compiler shipped with AnaConda (as well as conda install gcc swig -Windows/OSX compabilities -========================= +Windows/OSX compability +======================= Windows ~~~~~~~ From c1c3de0ecf8d5495381fe79f48bf806a74cc68d0 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Thu, 11 May 2017 10:28:01 +0200 Subject: [PATCH 11/12] doc: add section on time and memory limits --- doc/manual.rst | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/doc/manual.rst b/doc/manual.rst index da79c9834b..24ced5b8be 100644 --- a/doc/manual.rst +++ b/doc/manual.rst @@ -21,8 +21,28 @@ aspects of its usage: * `Sequential usage `_ * `Regression `_ -Restrict Searchspace -==================== +Time and memory limits +====================== + +A crucial feature of *auto-sklearn* is limiting the resources (memory and +time) which the scikit-learn 
algorithms are allowed to use. Especially for +large datasets, on which algorithms can take several hours and make the +machine swap, it is important to stop the evaluations after some time in order +to make progress in a reasonable amount of time. Setting the resource limits +is therefore a tradeoff between optimization time and the number of models +that can be tested. + +While *auto-sklearn* alleviates manual hyperparameter tuning, the user still +has to set memory and time limits. For most datasets a memory limit of 3GB or +6GB as found on most modern computers is sufficient. For the time limits it +is harder to give clear guidelines. If possible, a good default is a total +time limit of one day, and a time limit of 30 minutes for a single run. + +Further guidelines can be found in +`auto-sklearn/issues/142 `_. + +Restricting the Searchspace +=========================== Instead of using all available estimators, it is possible to restrict *auto-sklearn*'s searchspace. The following shows an example of how to exclude From d74add5b7251c3921cd25c8711924d9f7612f59b Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Thu, 11 May 2017 11:31:09 +0200 Subject: [PATCH 12/12] improve documentation --- autosklearn/estimators.py | 26 +++++++----- autosklearn/metrics/__init__.py | 8 ++-- doc/api.rst | 70 +++++++++++++++++++++++++++++++++ doc/index.rst | 10 ++--- doc/installation.rst | 13 +++--- doc/releases.rst | 2 + 6 files changed, 102 insertions(+), 27 deletions(-) diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index 6c6b1a34e2..e7de210dcf 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -53,15 +53,7 @@ def refit(self, X, y): def fit_ensemble(self, y, task=None, metric=None, precision='32', dataset_name=None, ensemble_nbest=None, ensemble_size=None): - """Build the ensemble. - This method only needs to be called in the parallel mode. 
- - Returns - ------- - self - - """ return self._automl.fit_ensemble(y, task, metric, precision, dataset_name, ensemble_nbest, ensemble_size) @@ -225,6 +217,16 @@ def __init__(self, an ensemble. * ``'model'`` : do not save any model files + configuration_mode : ``SMAC`` or ``ROAR`` + Defines the configuration mode as described in the paper + `Sequential Model-Based Optimization for General Algorithm + Configuration `_: + + * ``SMAC`` (default): Sequential Model-based Algorithm + Configuration, which is a Bayesian optimization algorithm + * ``ROAR``: Random Online Aggressive Racing, which is basically + random search + Attributes ---------- @@ -321,7 +323,9 @@ def fit_ensemble(self, y, task=None, metric=None, precision='32', multilabel classification or regression). metric : callable, optional (default='acc_metric') - An instance of ``autosklearn.metrics.Scorer``. + An instance of :class:`autosklearn.metrics.Scorer` as created by + :meth:`autosklearn.metrics.make_scorer`. These are the `Built-in + Metrics`_. precision : str Numeric precision used when loading ensemble data. Can be either @@ -376,7 +380,9 @@ def fit(self, X, y, The target classes. metric : callable, optional (default='acc_metric') - An instance of ``autosklearn.metrics.Scorer``. + An instance of :class:`autosklearn.metrics.Scorer` as created by + :meth:`autosklearn.metrics.make_scorer`. These are the `Built-in + Metrics`_. feat_type : list, optional (default=None) List of str of `len(X.shape[1])` describing the attribute type. diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index f581ded63b..83f324796c 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -137,12 +137,10 @@ def make_scorer(name, score_func, greater_is_better=True, needs_proba=False, needs_threshold=False, **kwargs): """Make a scorer from a performance metric or loss function. - Factory inspired by scikit-learn which wraps scoring functions to be used in - auto-sklearn. 
In difference to scikit-learn, auto-sklearn always needs to - call ``predict_proba`` in order to have predictions on a seperate validation - set to build ensembles with. + Factory inspired by scikit-learn which wraps scikit-learn scoring functions + to be used in auto-sklearn. - Paramaters + Parameters ---------- score_func : callable Score function (or loss function) with signature diff --git a/doc/api.rst b/doc/api.rst index 6f22fadccb..0e0bc48c14 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -5,9 +5,11 @@ APIs **** +============ Main modules ============ +~~~~~~~~~~~~~~ Classification ~~~~~~~~~~~~~~ @@ -15,6 +17,7 @@ Classification :members: :inherited-members: show_models, fit_ensemble, refit +~~~~~~~~~~ Regression ~~~~~~~~~~ @@ -22,6 +25,73 @@ Regression :members: :inherited-members: show_models, fit_ensemble, refit +======= +Metrics +======= + +.. autofunction:: autosklearn.metrics.make_scorer + +~~~~~~~~~~~~~~~~ +Built-in Metrics +~~~~~~~~~~~~~~~~ + +Classification +~~~~~~~~~~~~~~ + +.. autoclass:: autosklearn.metrics.accuracy + +.. autoclass:: autosklearn.metrics.balanced_accuracy + +.. autoclass:: autosklearn.metrics.f1 + +.. autoclass:: autosklearn.metrics.f1_macro + +.. autoclass:: autosklearn.metrics.f1_micro + +.. autoclass:: autosklearn.metrics.f1_samples + +.. autoclass:: autosklearn.metrics.f1_weighted + +.. autoclass:: autosklearn.metrics.roc_auc + +.. autoclass:: autosklearn.metrics.precision + +.. autoclass:: autosklearn.metrics.precision_macro + +.. autoclass:: autosklearn.metrics.precision_micro + +.. autoclass:: autosklearn.metrics.precision_samples + +.. autoclass:: autosklearn.metrics.precision_weighted + +.. autoclass:: autosklearn.metrics.average_precision + +.. autoclass:: autosklearn.metrics.recall + +.. autoclass:: autosklearn.metrics.recall_macro + +.. autoclass:: autosklearn.metrics.recall_micro + +.. autoclass:: autosklearn.metrics.recall_samples + +.. autoclass:: autosklearn.metrics.recall_weighted + +.. 
autoclass:: autosklearn.metrics.log_loss + +.. autoclass:: autosklearn.metrics.pac_score + +Regression +~~~~~~~~~~ + +.. autoclass:: autosklearn.metrics.r2 + +.. autoclass:: autosklearn.metrics.mean_squared_error + +.. autoclass:: autosklearn.metrics.mean_absolute_error + +.. autoclass:: autosklearn.metrics.median_absolute_error + +==================== Extension Interfaces ==================== diff --git a/doc/index.rst b/doc/index.rst index 72c125b66f..fddf413fa7 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -30,9 +30,7 @@ Example >>> import sklearn.model_selection >>> import sklearn.datasets >>> import sklearn.metrics - >>> digits = sklearn.datasets.load_digits() - >>> X = digits.data - >>> y = digits.target + >>> X, y = sklearn.datasets.load_digits(return_X_y=True) >>> X_train, X_test, y_train, y_test = \ sklearn.model_selection.train_test_split(X, y, random_state=1) >>> automl = autosklearn.classification.AutoSklearnClassifier() @@ -47,10 +45,10 @@ This will run for one hour should result in an accuracy above 0.98. Manual ****** +* :ref:`installation` * :ref:`manual` * :ref:`API` * :ref:`extending` -* :ref:`installation` License @@ -86,7 +84,7 @@ references to the following paper: Contributing ************ -We appreciate all contribution to auto-sklearn, from bug reports, +We appreciate all contributions to auto-sklearn, from bug reports and documentation to new features. If you want to contribute to the code, you can pick an issue from the `issue tracker `_ which is marked with `Needs contributer`. @@ -99,5 +97,5 @@ which is marked with `Needs contributer`. .com/automl/auto-sklearn/issues>`_ before starting to work. When developing new features, please create a new branch from the development -branch. Prior to submitting a pull request, make sure that all tests are +branch. When submitting a pull request, make sure that all tests are still passing. 
diff --git a/doc/installation.rst b/doc/installation.rst index f22ff4531e..e56c80078b 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -13,13 +13,13 @@ auto-sklearn has the following system requirements: * Linux operating system (for example Ubuntu), * Python (>=3.4). -* C++ compiler (which supports C++11) and SWIG +* C++ compiler (with C++11 supports) and SWIG For an explanation of missing Microsoft Windows and MAC OSX support please -check the Section `Windows/OSX compabilities`_. +check the Section `Windows/OSX compability`_. -Python requirements -=================== +Installing auto-sklearn +======================= Please install all dependencies manually with: @@ -42,7 +42,7 @@ Anaconda installation Anaconda does not ship *auto-sklearn*, and there are no conda packages for *auto-sklearn*. Thus, it is easiest to install *auto-sklearn* as detailed in -the Section `Python requirements`_. +the Section `Installing auto-sklearn`_. A common installation problem under recent Linux distribution is the incompability of the compiler version used to compile the Python binary @@ -82,7 +82,8 @@ issues holding us back from actively supporting OSX: (see `SMAC3/issues/115 `_). * OSX machines on `travis-ci `_ take more than 30 minutes to spawn. This makes it impossible for us to run unit tests for - *auto-sklearn* and its requirements. + *auto-sklearn* and its dependencies `SMAC3 `_ + and `ConfigSpace `_. Possible solutions (not tested): diff --git a/doc/releases.rst b/doc/releases.rst index 8689a38ae6..d2dcd15ce3 100644 --- a/doc/releases.rst +++ b/doc/releases.rst @@ -24,6 +24,8 @@ Major changes * Upgrade to scikit-learn 0.18.1. * Drop XGBoost as the latest release (0.6a2) does not work when spawned by the pyninsher. +* *auto-sklearn* can use multiprocessing in calls to ``predict()`` and + ``predict_proba``. By `Laurent Sorber `_. Contributors ~~~~~~~~~~~~