diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index e7de210dcf..1fcc86c481 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -14,7 +14,6 @@ from autosklearn.util.backend import create - class AutoMLDecorator(object): def __init__(self, automl): @@ -379,7 +378,7 @@ def fit(self, X, y, y : array-like, shape = [n_samples] or [n_samples, n_outputs] The target classes. - metric : callable, optional (default='acc_metric') + metric : callable, optional (default='autosklearn.metrics.accuracy') An instance of :class:`autosklearn.metrics.Scorer` as created by :meth:`autosklearn.metrics.make_scorer`. These are the `Built-in Metrics`_. @@ -388,7 +387,7 @@ def fit(self, X, y, List of str of `len(X.shape[1])` describing the attribute type. Possible types are `Categorical` and `Numerical`. `Categorical` attributes will be automatically One-Hot encoded. The values - used for a categorical attribute must be integers, obtainde for + used for a categorical attribute must be integers, obtained for example by `sklearn.preprocessing.LabelEncoder `_. @@ -464,11 +463,10 @@ def fit(self, X, y, y : array-like, shape = [n_samples] or [n_samples, n_outputs] The regression target. - metric : str, optional (default='r2_metric') - The metric to optimize for. Can be one of: ['r2_metric', - 'a_metric']. A description of the metrics can be found in - `the paper describing the AutoML Challenge - `_. + metric : callable, optional (default='autosklearn.metrics.r2') + An instance of :class:`autosklearn.metrics.Scorer` as created by + :meth:`autosklearn.metrics.make_scorer`. These are the `Built-in + Metrics`_. feat_type : list, optional (default=None) List of str of `len(X.shape[1])` describing the attribute type. 
@@ -541,7 +539,7 @@ def fit(self, X, y, if task == MULTILABEL_CLASSIFICATION: metric = f1_macro else: - metric=accuracy + metric = accuracy y = self._process_target_classes(y) @@ -586,7 +584,6 @@ def _process_target_classes(self, y): return y - def predict(self, X, batch_size=None, n_jobs=1): predicted_probabilities = self._automl.predict( X, batch_size=batch_size, n_jobs=n_jobs) diff --git a/doc/manual.rst b/doc/manual.rst index 24ced5b8be..d21bb1412a 100644 --- a/doc/manual.rst +++ b/doc/manual.rst @@ -20,6 +20,9 @@ aspects of its usage: * `Parallel usage `_ * `Sequential usage `_ * `Regression `_ +* `Continuous and Categorical Data `_ +* `Using Custom metrics `_ + Time and memory limits ====================== @@ -64,7 +67,7 @@ For a full list please have a look at the source code (in `autosklearn/pipeline/ * `Regressors `_ * `Preprocessors `_ -Turning of preprocessing -~~~~~~~~~~~~~~~~~~~~~~~~ +Turning off preprocessing +~~~~~~~~~~~~~~~~~~~~~~~~~ Preprocessing in *auto-sklearn* is divided into data preprocessing and diff --git a/example/example_feature_types.py b/example/example_feature_types.py new file mode 100644 index 0000000000..3c6e78ec8d --- /dev/null +++ b/example/example_feature_types.py @@ -0,0 +1,57 @@ +# -*- encoding: utf-8 -*- +import sklearn.model_selection +import sklearn.datasets +import sklearn.metrics + +import autosklearn.classification + +try: + import openml +except ImportError: + print("#"*80 + """ + To run this example you need to install openml-python: + + git+https://github.com/renatopp/liac-arff + # OpenML is currently not on pypi, use an old version to not depend on + # scikit-learn 0.18 + requests + xmltodict + git+https://github.com/renatopp/liac-arff + git+https://github.com/openml/""" + + "openml-python@0b9009b0436fda77d9f7c701bd116aff4158d5e1\n""" + + "#"*80) + raise + + +def main(): + # Load adult dataset from openml.org, see https://www.openml.org/t/2117 + openml.config.apikey = '610344db6388d9ba34f6db45a3cf71de' + + task = openml.tasks.get_task(2117) + 
train_indices, test_indices = task.get_train_test_split_indices() + X, y = task.get_X_and_y() + + X_train = X[train_indices] + y_train = y[train_indices] + X_test = X[test_indices] + y_test = y[test_indices] + + dataset = task.get_dataset() + _, _, categorical_indicator = dataset.\ + get_data(target=task.target_name, return_categorical_indicator=True) + + # Create feature type list from openml.org indicator and run autosklearn + feat_type = ['categorical' if ci else 'numerical' + for ci in categorical_indicator] + + cls = autosklearn.classification.\ + AutoSklearnClassifier(time_left_for_this_task=120, + per_run_time_limit=30) + cls.fit(X_train, y_train, feat_type=feat_type) + + predictions = cls.predict(X_test) + print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions)) + + +if __name__ == "__main__": + main() diff --git a/example/example_metrics.py b/example/example_metrics.py new file mode 100644 index 0000000000..596492f3e4 --- /dev/null +++ b/example/example_metrics.py @@ -0,0 +1,119 @@ +# -*- encoding: utf-8 -*- +import numpy as np + +import sklearn.model_selection +import sklearn.datasets +import sklearn.metrics + +import autosklearn.classification +import autosklearn.metrics + +try: + import openml +except ImportError: + print("#"*80 + """ + To run this example you need to install openml-python: + + git+https://github.com/renatopp/liac-arff + # OpenML is currently not on pypi, use an old version to not depend on + # scikit-learn 0.18 + requests + xmltodict + git+https://github.com/renatopp/liac-arff + git+https://github.com/openml/""" + + "openml-python@0b9009b0436fda77d9f7c701bd116aff4158d5e1\n""" + + "#"*80) + raise + + +def accuracy(solution, prediction): + # function defining accuracy + return np.mean(solution == prediction) + + +def accuracy_wk(solution, prediction, dummy): + # function defining accuracy and accepting an additional argument + assert dummy is None + return np.mean(solution == prediction) + + +def main(): + # Load 
adult dataset from openml.org, see https://www.openml.org/t/2117 + openml.config.apikey = '610344db6388d9ba34f6db45a3cf71de' + + task = openml.tasks.get_task(2117) + train_indices, test_indices = task.get_train_test_split_indices() + X, y = task.get_X_and_y() + + X_train = X[train_indices] + y_train = y[train_indices] + X_test = X[test_indices] + y_test = y[test_indices] + + dataset = task.get_dataset() + _, _, categorical_indicator = dataset.\ + get_data(target=task.target_name, return_categorical_indicator=True) + + # Create feature type list from openml.org indicator and run autosklearn + feat_type = ['categorical' if ci else 'numerical' + for ci in categorical_indicator] + + # Print a list of available metrics + print("Available CLASSIFICATION metrics autosklearn.metrics.*:") + print("\t*" + "\n\t*".join(autosklearn.metrics.CLASSIFICATION_METRICS)) + + print("Available REGRESSION autosklearn.metrics.*:") + print("\t*" + "\n\t*".join(autosklearn.metrics.REGRESSION_METRICS)) + + # First example: Use predefined accuracy metric + print("#"*80) + print("Use predefined accuracy metric") + cls = autosklearn.classification.\ + AutoSklearnClassifier(time_left_for_this_task=60, + per_run_time_limit=30, seed=1) + cls.fit(X_train, y_train, feat_type=feat_type, + metric=autosklearn.metrics.accuracy) + + predictions = cls.predict(X_test) + print("Accuracy score {:g} using {:s}". + format(sklearn.metrics.accuracy_score(y_test, predictions), + cls._automl._automl._metric.name)) + + print("#"*80) + print("Use self defined accuracy accuracy metric") + accuracy_scorer = autosklearn.metrics.make_scorer(name="accu", + score_func=accuracy, + greater_is_better=True, + needs_proba=False, + needs_threshold=False) + cls = autosklearn.classification.\ + AutoSklearnClassifier(time_left_for_this_task=60, + per_run_time_limit=30, seed=1) + cls.fit(X_train, y_train, feat_type=feat_type, metric=accuracy_scorer) + + predictions = cls.predict(X_test) + print("Accuracy score {:g} using {:s}". 
+ format(sklearn.metrics.accuracy_score(y_test, predictions), + cls._automl._automl._metric.name)) + + print("#"*80) + print("Use self defined accuracy with additional argument") + accuracy_scorer = autosklearn.metrics.make_scorer(name="accu_add", + score_func=accuracy_wk, + greater_is_better=True, + needs_proba=False, + needs_threshold=False, + dummy=None) + cls = autosklearn.classification.\ + AutoSklearnClassifier(time_left_for_this_task=60, + per_run_time_limit=30, seed=1) + cls.fit(X_train, y_train, feat_type=feat_type, metric=accuracy_scorer) + + predictions = cls.predict(X_test) + print("Accuracy score {:g} using {:s}". + format(sklearn.metrics.accuracy_score(y_test, predictions), + cls._automl._automl._metric.name)) + + +if __name__ == "__main__": + main()