diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py
index c3fa6b7abf..e7de210dcf 100644
--- a/autosklearn/estimators.py
+++ b/autosklearn/estimators.py
@@ -53,15 +53,7 @@ def refit(self, X, y):
     def fit_ensemble(self, y, task=None, metric=None, precision='32',
                      dataset_name=None, ensemble_nbest=None,
                      ensemble_size=None):
-        """Build the ensemble.
-        This method only needs to be called in the parallel mode.
-
-        Returns
-        -------
-        self
-
-        """
         return self._automl.fit_ensemble(y, task, metric, precision,
                                          dataset_name, ensemble_nbest,
                                          ensemble_size)
@@ -225,6 +217,16 @@ def __init__(self,
             an ensemble.
         * ``'model'`` : do not save any model files

+    configuration_mode : ``SMAC`` or ``ROAR``
+        Defines the configuration mode as described in the paper
+        `Sequential Model-Based Optimization for General Algorithm
+        Configuration `_:
+
+        * ``SMAC`` (default): Sequential Model-based Algorithm
+          Configuration, which is a Bayesian optimization algorithm
+        * ``ROAR``: Random Online Aggressive Racing, which is basically
+          random search
+
     Attributes
     ----------
@@ -305,6 +307,46 @@ def fit(self, *args, **kwargs):
     def fit_ensemble(self, y, task=None, metric=None, precision='32',
                      dataset_name=None, ensemble_nbest=None,
                      ensemble_size=None):
+        """Fit an ensemble to models trained during an optimization process.
+
+        All parameters are ``None`` by default. If no other value is given,
+        the default values which were set in a call to ``fit()`` are used.
+
+        Parameters
+        ----------
+        y : array-like
+            Target values.
+
+        task : int
+            A constant from the module ``autosklearn.constants``. Determines
+            the task type (binary classification, multiclass classification,
+            multilabel classification or regression).
+
+        metric : callable, optional (default='acc_metric')
+            An instance of :class:`autosklearn.metrics.Scorer` as created by
+            :meth:`autosklearn.metrics.make_scorer`. These are the `Built-in
+            Metrics`_.
+
+        precision : str
+            Numeric precision used when loading ensemble data. Can be either
+            ``'16'``, ``'32'`` or ``'64'``.
+
+        dataset_name : str
+            Name of the current data set.
+
+        ensemble_nbest : int
+            Determines how many models should be considered for ensemble
+            building. This is inspired by a concept called library pruning
+            introduced in `Getting the Most Out of Ensemble Selection`.
+
+        ensemble_size : int
+            Size of the ensemble built by `Ensemble Selection`.
+
+        Returns
+        -------
+        self
+
+        """
         if self._automl is None:
             self._automl = self.build_automl()
         return self._automl.fit_ensemble(y, task, metric, precision,
@@ -338,12 +380,17 @@ def fit(self, X, y,
             The target classes.

         metric : callable, optional (default='acc_metric')
-            An instance of ``autosklearn.metrics.Scorer``.
+            An instance of :class:`autosklearn.metrics.Scorer` as created by
+            :meth:`autosklearn.metrics.make_scorer`. These are the `Built-in
+            Metrics`_.

         feat_type : list, optional (default=None)
             List of str of `len(X.shape[1])` describing the attribute type.
             Possible types are `Categorical` and `Numerical`. `Categorical`
-            attributes will be automatically One-Hot encoded.
+            attributes will be automatically One-Hot encoded. The values
+            used for a categorical attribute must be integers, obtained for
+            example by `sklearn.preprocessing.LabelEncoder
+            <http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html>`_.
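+
+            A minimal sketch of preparing a categorical column before
+            calling ``fit()`` (the column index and variable names below
+            are purely illustrative):
+
+            >>> import sklearn.preprocessing
+            >>> # assume the first column of X is categorical
+            >>> encoder = sklearn.preprocessing.LabelEncoder()
+            >>> X[:, 0] = encoder.fit_transform(X[:, 0])
+            >>> feat_type = ['Categorical'] + ['Numerical'] * (X.shape[1] - 1)
+            >>> automl.fit(X, y, feat_type=feat_type)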

         dataset_name : str, optional (default=None)
             Create nicer output. If None, a string will be determined by the
diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py
index f581ded63b..83f324796c 100644
--- a/autosklearn/metrics/__init__.py
+++ b/autosklearn/metrics/__init__.py
@@ -137,12 +137,10 @@ def make_scorer(name, score_func, greater_is_better=True, needs_proba=False,
                 needs_threshold=False, **kwargs):
     """Make a scorer from a performance metric or loss function.

-    Factory inspired by scikit-learn which wraps scoring functions to be used in
-    auto-sklearn. In difference to scikit-learn, auto-sklearn always needs to
-    call ``predict_proba`` in order to have predictions on a seperate validation
-    set to build ensembles with.
+    Factory inspired by scikit-learn which wraps scikit-learn scoring functions
+    to be used in auto-sklearn.
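+
+    A minimal usage sketch, wrapping a scikit-learn metric (the variable
+    name is illustrative; the arguments follow the signature documented
+    below):
+
+    >>> import sklearn.metrics
+    >>> import autosklearn.metrics
+    >>> accuracy_scorer = autosklearn.metrics.make_scorer(
+    ...     'accuracy', sklearn.metrics.accuracy_score,
+    ...     greater_is_better=True)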

-    Paramaters
+    Parameters
     ----------
     score_func : callable
         Score function (or loss function) with signature
diff --git a/doc/api.rst b/doc/api.rst
index 6f22fadccb..0e0bc48c14 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -5,9 +5,11 @@ APIs
 ****

+============
 Main modules
 ============

+~~~~~~~~~~~~~~
 Classification
 ~~~~~~~~~~~~~~

 .. autoclass:: autosklearn.classification.AutoSklearnClassifier
     :members:
     :inherited-members: show_models, fit_ensemble, refit
@@ -15,6 +17,7 @@ Classification
     :members:
     :inherited-members: show_models, fit_ensemble, refit

+~~~~~~~~~~
 Regression
 ~~~~~~~~~~

 .. autoclass:: autosklearn.regression.AutoSklearnRegressor
@@ -22,6 +25,73 @@ Regression
     :members:
     :inherited-members: show_models, fit_ensemble, refit

+=======
+Metrics
+=======
+
+.. autofunction:: autosklearn.metrics.make_scorer
+
+~~~~~~~~~~~~~~~~
+Built-in Metrics
+~~~~~~~~~~~~~~~~
+
+Classification
+~~~~~~~~~~~~~~
+
+.. autoclass:: autosklearn.metrics.accuracy
+
+.. autoclass:: autosklearn.metrics.balanced_accuracy
+
+.. autoclass:: autosklearn.metrics.f1
+
+.. autoclass:: autosklearn.metrics.f1_macro
+
+.. autoclass:: autosklearn.metrics.f1_micro
+
+.. autoclass:: autosklearn.metrics.f1_samples
+
+.. autoclass:: autosklearn.metrics.f1_weighted
+
+.. autoclass:: autosklearn.metrics.roc_auc
+
+.. autoclass:: autosklearn.metrics.precision
+
+.. autoclass:: autosklearn.metrics.precision_macro
+
+.. autoclass:: autosklearn.metrics.precision_micro
+
+.. autoclass:: autosklearn.metrics.precision_samples
+
+.. autoclass:: autosklearn.metrics.precision_weighted
+
+.. autoclass:: autosklearn.metrics.average_precision
+
+.. autoclass:: autosklearn.metrics.recall
+
+.. autoclass:: autosklearn.metrics.recall_macro
+
+.. autoclass:: autosklearn.metrics.recall_micro
+
+.. autoclass:: autosklearn.metrics.recall_samples
+
+.. autoclass:: autosklearn.metrics.recall_weighted
+
+.. autoclass:: autosklearn.metrics.log_loss
+
+.. autoclass:: autosklearn.metrics.pac_score
+
+Regression
+~~~~~~~~~~
+
+.. autoclass:: autosklearn.metrics.r2
+
+.. autoclass:: autosklearn.metrics.mean_squared_error
+
+.. autoclass:: autosklearn.metrics.mean_absolute_error
+
+.. autoclass:: autosklearn.metrics.median_absolute_error
+
+====================
 Extension Interfaces
 ====================
diff --git a/doc/conf.py b/doc/conf.py
index afd7ed3c39..6b93d98691 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -131,9 +131,11 @@
     # be in the form [(name, page), ..]
     'navbar_links': [
         ('Start', 'index'),
+        ('Releases', 'releases'),
+        ('Installation', 'installation'),
+        ('Manual', 'manual'),
         ('API', 'api'),
         ('Extending', 'extending'),
-        ('Manual', 'manual'),
     ],

     # Render the next and previous page links in navbar. (Default: true)
diff --git a/doc/extending.rst b/doc/extending.rst
index d7595614c4..b1dbe5fa2b 100644
--- a/doc/extending.rst
+++ b/doc/extending.rst
@@ -8,7 +8,7 @@ Extending auto-sklearn

 auto-sklearn can be easily extended with new classification, regression and
 feature preprocessing methods. In order to do so, a user has to implement a
-wrapper class and make it known to auto-sklearn. This manual will walk you
+wrapper class and register it with auto-sklearn. This manual will walk you
 through the process.
diff --git a/doc/index.rst b/doc/index.rst
index 9043858843..fddf413fa7 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -30,9 +30,7 @@ Example
 >>> import sklearn.model_selection
 >>> import sklearn.datasets
 >>> import sklearn.metrics
->>> digits = sklearn.datasets.load_digits()
->>> X = digits.data
->>> y = digits.target
+>>> X, y = sklearn.datasets.load_digits(return_X_y=True)
 >>> X_train, X_test, y_train, y_test = \
         sklearn.model_selection.train_test_split(X, y, random_state=1)
 >>> automl = autosklearn.classification.AutoSklearnClassifier()
@@ -44,34 +42,12 @@ Example

 This will run for one hour and should result in an accuracy above 0.98.

-Installation
-************
-**Prerequisities**: *auto-sklearn* is written in python and was developed
-with Ubuntu. It should run on other Linux distributions, but won't work on a MAC
-or on a windows PC. We aim to always support the two latests python versions,
-which are 3.4 and 3.5 at the moment. It is built around scikit-learn 0.17.1 and
-needs a compiler for C++ 11.
-
-Please install all dependencies manually with:
-
-.. code:: bash
-
-    curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip install
-
-Then install *auto-sklearn*
-
-.. code:: bash
-
-    pip install auto-sklearn
-
-We recommend installing *auto-sklearn* into a `virtual environment
-`_.
-
 Manual
 ******

-* :ref:`API`
+* :ref:`installation`
 * :ref:`manual`
+* :ref:`API`
 * :ref:`extending`

@@ -108,7 +84,7 @@ references to the following paper:
 Contributing
 ************

-We appreciate all contribution to auto-sklearn, from bug reports,
+We appreciate all contributions to auto-sklearn, from bug reports and
 documentation to new features. If you want to contribute to the code, you can
 pick an issue from the `issue tracker
 <https://github.com/automl/auto-sklearn/issues>`_ which is marked with `Needs contributer`.
@@ -121,24 +97,5 @@ which is marked with `Needs contributer`.
 .com/automl/auto-sklearn/issues>`_ before starting to work.

 When developing new features, please create a new branch from the development
-branch. Prior to submitting a pull request, make sure that all tests are
+branch. When submitting a pull request, make sure that all tests are
 still passing.
-
-Contributors
-************
-
-* Matthias Feurer
-* Katharina Eggensperger
-* Jost Tobias Springenberg
-* Aaron Klein
-* Anatolii Domashnev
-* Alexander Sapronov
-* Stefan Falkner
-* Manuel Blum
-* Hector Mendoza
-* Farooq Ahmed Zuberi
-* Frank Hutter
-* Diego Kobylkin
-* Marius Lindauer
-
-
diff --git a/doc/installation.rst b/doc/installation.rst
new file mode 100644
index 0000000000..e56c80078b
--- /dev/null
+++ b/doc/installation.rst
@@ -0,0 +1,91 @@
+:orphan:
+
+.. _installation:
+
+============
+Installation
+============
+
+System requirements
+===================
+
+auto-sklearn has the following system requirements:
+
+* Linux operating system (for example Ubuntu),
+* Python (>=3.4),
+* C++ compiler (with C++11 support) and SWIG.
+
+For an explanation of missing Microsoft Windows and Mac OSX support please
+check the Section `Windows/OSX compatibility`_.
+
+Installing auto-sklearn
+=======================
+
+Please install all dependencies manually with:
+
+.. code:: bash
+
+    curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip install
+
+Then install *auto-sklearn*:
+
+.. code:: bash
+
+    pip install auto-sklearn
+
+We recommend installing *auto-sklearn* into a `virtual environment
+`_ or an `Anaconda
+environment `_.
+
+Anaconda installation
+=====================
+
+Anaconda does not ship *auto-sklearn*, and there are no conda packages for
+*auto-sklearn*. Thus, it is easiest to install *auto-sklearn* as detailed in
+the Section `Installing auto-sklearn`_.
+
+A common installation problem under recent Linux distributions is the
+incompatibility of the compiler version used to compile the Python binary
+shipped by Anaconda and the compiler installed by the distribution. This can
+be solved by installing the *gcc* compiler shipped with Anaconda (as well as
+*swig*):
+
+.. code:: bash
+
+    conda install gcc swig
+
+
+Windows/OSX compatibility
+=========================
+
+Windows
+~~~~~~~
+
+*auto-sklearn* relies heavily on the Python module ``resource``. ``resource``
+is part of Python's `Unix Specific Services
+<https://docs.python.org/3/library/unix.html>`_ and not available on a
+Windows machine. Therefore, it is not possible to run *auto-sklearn* on a
+Windows machine.
+
+Possible solutions (not tested):
+
+* Windows 10 bash shell
+* virtual machine
+* docker image
+
+Mac OSX
+~~~~~~~
+
+Auto-sklearn is known to work on OSX systems. Nevertheless, there are two
+issues holding us back from actively supporting OSX:
+
+* The ``resource`` module cannot enforce a memory limit on a Python process
+  (see `SMAC3/issues/115 <https://github.com/automl/SMAC3/issues/115>`_).
+* OSX machines on `travis-ci <https://travis-ci.org>`_ take more than 30
+  minutes to spawn. This makes it impossible for us to run unit tests for
+  *auto-sklearn* and its dependencies `SMAC3
+  <https://github.com/automl/SMAC3>`_ and `ConfigSpace
+  <https://github.com/automl/ConfigSpace>`_.
+
+Possible solutions (not tested):
+
+* virtual machine
+* docker image
diff --git a/doc/manual.rst b/doc/manual.rst
index 57cf43da45..24ced5b8be 100644
--- a/doc/manual.rst
+++ b/doc/manual.rst
@@ -2,14 +2,47 @@

 .. _manual:

+======
 Manual
-~~~~~~
+======

 This manual shows how to use several aspects of auto-sklearn. It either
 references the examples where possible or explains certain configurations.

-Restrict Searchspace
-*********************
+Examples
+========
+
+*auto-sklearn* comes with the following examples which demonstrate several
+aspects of its usage:
+
+* `Holdout `_
+* `Cross-validation `_
+* `Parallel usage `_
+* `Sequential usage `_
+* `Regression `_
+
+Time and memory limits
+======================
+
+A crucial feature of *auto-sklearn* is limiting the resources (memory and
+time) which the scikit-learn algorithms are allowed to use. Especially for
+large datasets, on which algorithms can take several hours and make the
+machine swap, it is important to stop the evaluations after some time in order
+to make progress in a reasonable amount of time. Setting the resource limits
+is therefore a tradeoff between optimization time and the number of models
+that can be tested.
+
+While *auto-sklearn* alleviates manual hyperparameter tuning, the user still
+has to set memory and time limits. For most datasets a memory limit of 3GB or
+6GB as found on most modern computers is sufficient. For the time limits it
+is harder to give clear guidelines. If possible, a good default is a total
+time limit of one day, and a time limit of 30 minutes for a single run.
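+
+As a rough sketch, the guideline above translates into the following
+settings (the parameter names ``time_left_for_this_task`` and
+``per_run_time_limit`` (both in seconds) and ``ml_memory_limit`` (in MB)
+are assumed here for illustration):
+
+>>> import autosklearn.classification
+>>> automl = autosklearn.classification.AutoSklearnClassifier(
+>>>     time_left_for_this_task=86400, per_run_time_limit=1800,
+>>>     ml_memory_limit=3072)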
+
+Further guidelines can be found in
+`auto-sklearn/issues/142 <https://github.com/automl/auto-sklearn/issues/142>`_.
+
+Restricting the Searchspace
+===========================

 Instead of using all available estimators, it is possible to restrict
 *auto-sklearn*'s searchspace. The following shows an example of how to exclude
@@ -17,9 +50,9 @@
 all preprocessing methods and restrict the configuration space to only random
 forests.

 >>> import autosklearn.classification
->>> automl = autosklearn.classification.AutoSklearnClassifier(include_estimators=["random_forest", ],
->>>     exclude_estimators=None, include_preprocessors=["no_preprocessing", ],
->>>     exclude_preprocessors=None)
+>>> automl = autosklearn.classification.AutoSklearnClassifier(
+>>>     include_estimators=["random_forest", ], exclude_estimators=None,
+>>>     include_preprocessors=["no_preprocessing", ], exclude_preprocessors=None)
 >>> automl.fit(X_train, y_train)
 >>> predictions = automl.predict(X_test)

@@ -31,13 +64,25 @@ For a full list please have a look at the source code (in `autosklearn/pipeline/
 * `Regressors `_
 * `Preprocessors `_

+Turning off preprocessing
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Preprocessing in *auto-sklearn* is divided into data preprocessing and
+feature preprocessing. Data preprocessing includes One-Hot encoding of
+categorical features, imputation of missing values and the normalization of
+features or samples. These steps currently cannot be turned off. Feature
+preprocessing is a single transformer which implements for example feature
+selection or transformation of features into a different space (e.g. PCA).
+This can be turned off by setting
+``include_preprocessors=["no_preprocessing"]`` as shown in the example above.
+
 Resampling strategies
-*********************
+=====================

 Examples for using holdout and cross-validation can be found in
 `auto-sklearn/examples/ `_

 Parallel computation
-********************
+====================

 *auto-sklearn* supports parallel execution by data sharing on a shared file
 system. In this mode, the SMAC algorithm shares the training data for its
@@ -60,9 +105,25 @@ from `pypi` as a binary wheel (`see here
 `_
-from scikit-learn.
\ No newline at end of file
+from scikit-learn.
+
+Vanilla auto-sklearn
+====================
+
+In order to obtain *vanilla auto-sklearn* as used in `Efficient and Robust
+Automated Machine Learning `_
+set ``ensemble_size=1`` and ``initial_configurations_via_metalearning=0``:
+
+>>> import autosklearn.classification
+>>> automl = autosklearn.classification.AutoSklearnClassifier(
+>>>     ensemble_size=1, initial_configurations_via_metalearning=0)
+
+An ensemble of size one will result in always choosing the current best model
+according to its performance on the validation set. Setting the initial
+configurations found by meta-learning to zero makes *auto-sklearn* use the
+regular SMAC algorithm for suggesting new hyperparameter configurations.
\ No newline at end of file
diff --git a/doc/releases.rst b/doc/releases.rst
new file mode 100644
index 0000000000..d2dcd15ce3
--- /dev/null
+++ b/doc/releases.rst
@@ -0,0 +1,60 @@
+:orphan:
+
+.. _releases:
+
+..
+    The following command retrieves all committers since a specified
+    commit. From http://stackoverflow.com/questions/6482436/list-of-authors-in-git-since-a-given-commit
+    git log 2e29eba.. --format="%aN <%aE>" --reverse | perl -e 'my %dedupe; while (<>) { print unless $dedupe{$_}++}'
+
+========
+Releases
+========
+
+Version 0.2.0
+=============
+
+Major changes
+~~~~~~~~~~~~~
+
+* **auto-sklearn supports custom metrics and all metrics included in
+  scikit-learn**. Different metrics can now be passed to the ``fit()`` method
+  of the estimator objects, for example
+  ``AutoSklearnClassifier.fit(metric='roc_auc')`` (see the sketch after this
+  list).
+* Upgrade to scikit-learn 0.18.1.
+* Drop XGBoost as the latest release (0.6a2) does not work when spawned by
+  the pynisher.
+* *auto-sklearn* can use multiprocessing in calls to ``predict()`` and
+  ``predict_proba()``. By `Laurent Sorber `_.
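+
+As an illustration of the first change, a built-in metric object can be
+passed directly to ``fit()`` (a sketch: ``X_train`` and ``y_train`` are
+assumed to exist, and ``roc_auc`` is one of the built-in metrics listed in
+the API documentation):
+
+>>> import autosklearn.classification
+>>> import autosklearn.metrics
+>>> automl = autosklearn.classification.AutoSklearnClassifier()
+>>> automl.fit(X_train, y_train, metric=autosklearn.metrics.roc_auc)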
--format="%aN <%aE>" --reverse | perl -e 'my %dedupe; while () { print unless $dedupe{$_}++}' + +======== +Releases +======== + +Version 0.2.0 +============= + +Major changes +~~~~~~~~~~~~~ + +* **auto-sklearn supports custom metrics and all metrics included in + scikit-learn**. Different metrics can now be passed to the ``fit()``-method + estimator objects, for example + ``AutoSklearnClassifier.fit(metric='roc_auc')``. +* Upgrade to scikit-learn 0.18.1. +* Drop XGBoost as the latest release (0.6a2) does not work when spawned by + the pyninsher. +* *auto-sklearn* can use multiprocessing in calls to ``predict()`` and + ``predict_proba``. By `Laurent Sorber `_. + +Contributors +~~~~~~~~~~~~ + +* Matthias Feurer +* Katharina Eggensperger +* Laurent Sorber + +Version 0.1.x +============= + +There are no release notes for auto-sklearn prior to version 0.2.0. + +Contributors +~~~~~~~~~~~~ + +* Matthias Feurer +* Katharina Eggensperger +* Aaron Klein +* Jost Tobias Springenberg +* Anatolii Domashnev +* Stefan Falkner +* Alexander Sapronov +* Manuel Blum +* Diego Kobylkin +* Jaidev Deshpande +* Jongheon Jeong +* Hector Mendoza +* Timothy J Laurent +* Marius Lindauer +* _329_ +* Iver Jordal diff --git a/test/test_automl/test_estimators.py b/test/test_automl/test_estimators.py index 4364b3bd34..4dfada97a7 100644 --- a/test/test_automl/test_estimators.py +++ b/test/test_automl/test_estimators.py @@ -141,8 +141,7 @@ def test_fit_pSMAC(self): seed=2, initial_configurations_via_metalearning=0, ensemble_size=0) - automl.fit_ensemble(Y_train, - task=MULTICLASS_CLASSIFICATION, + automl.fit_ensemble(Y_train, task=MULTICLASS_CLASSIFICATION, metric=accuracy, precision='32', dataset_name='iris',