Ravin Kohli: [RELEASE] Release v0.2 (#448)
Github Actions committed Jul 18, 2022
1 parent 2faea3f commit e141e4c
Showing 106 changed files with 8,938 additions and 32,219 deletions.
2 changes: 1 addition & 1 deletion master/.buildinfo
@@ -1,4 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 970eb458f32b434db48854a3317b2f05
config: 29bd076c4adc563a4d280f931be44bec
tags: 645f666f9bcd5a90fca523b33c5a78b7
@@ -0,0 +1,81 @@
"""
==============================
Plot the Performance over Time
==============================
Auto-Pytorch uses SMAC to fit individual machine learning algorithms
and then ensembles them together using `Ensemble Selection
<https://www.cs.cornell.edu/~caruana/ctp/ct.papers/caruana.icml04.icdm06long.pdf>`_.
The following examples shows how to plot both the performance
of the individual models and their respective ensemble.
Additionally, as we are compatible with matplotlib,
you can input any args or kwargs that are compatible with ax.plot.
In the case when you would like to create multipanel visualization,
please input plt.Axes obtained from matplotlib.pyplot.subplots.
"""
import warnings

import numpy as np
import pandas as pd

from sklearn import model_selection

import matplotlib.pyplot as plt

from autoPyTorch.api.tabular_classification import TabularClassificationTask
from autoPyTorch.utils.results_visualizer import PlotSettingParams


warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)


############################################################################
# Task Definition
# ===============
n_samples, dim = 100, 2
X = np.random.random((n_samples, dim)) * 2 - 1
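# Label is 1 inside the circle of radius sqrt(2 / pi) (area 2, i.e. half of the
# [-1, 1]^2 square), else 0, so the two classes are roughly balanced.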
y = ((X ** 2).sum(axis=-1) < 2 / np.pi).astype(np.int32)
print(y)

X, y = pd.DataFrame(X), pd.DataFrame(y)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y)

############################################################################
# API Instantiation and Searching
# ===============================
api = TabularClassificationTask(seed=42)

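# Search with a 2-minute overall wall-time budget and at most 10 seconds per
# pipeline evaluation.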
api.search(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
           optimize_metric='accuracy', total_walltime_limit=120, func_eval_time_limit_secs=10)

############################################################################
# Create Setting Parameters Object
# ================================
metric_name = 'accuracy'

params = PlotSettingParams(
    xscale='log',
    xlabel='Runtime',
    ylabel='Accuracy',
    title='Toy Example',
    figname='example_plot_over_time.png',
    savefig_kwargs={'bbox_inches': 'tight'},
    show=False  # To display the figure instead, set show=True and figname=None
)

############################################################################
# Plot with the Specified Setting Parameters
# ==========================================
# _, ax = plt.subplots()  # You can create an Axes like this and pass it in to post-process the figure.

# You might need to run `export DISPLAY=:0.0` if you are in a non-GUI environment.
api.plot_perf_over_time(
    metric_name=metric_name,
    plot_setting_params=params,
    marker='*',
    markersize=10
)
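
# As a follow-up to the multipanel note in the docstring above, here is a minimal
# two-panel sketch. It reuses the `api` and `metric_name` objects from this example
# and assumes that `plot_perf_over_time` accepts the target axes through an `ax`
# keyword, as the commented-out `plt.subplots()` line above suggests; the panel
# layout and labels are illustrative, not part of the original file.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: linear runtime axis; right panel: logarithmic runtime axis.
api.plot_perf_over_time(
    metric_name=metric_name,
    plot_setting_params=PlotSettingParams(xlabel='Runtime', ylabel='Accuracy'),
    ax=ax1,
    marker='*',
    markersize=10
)
api.plot_perf_over_time(
    metric_name=metric_name,
    plot_setting_params=PlotSettingParams(xscale='log', xlabel='Runtime', ylabel='Accuracy'),
    ax=ax2,
    marker='*',
    markersize=10
)
fig.savefig('example_plot_over_time_multipanel.png', bbox_inches='tight')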
@@ -93,7 +93,7 @@

############################################################################
# Search for an ensemble of machine learning algorithms
# -----------------------------------------------------------------------
# -----------------------------------------------------

api.search(
X_train=X_train,
@@ -107,7 +107,7 @@

############################################################################
# Print the final ensemble performance
# ------------
# ------------------------------------
y_pred = api.predict(X_test)
score = api.score(y_pred, y_test)
print(score)
@@ -118,7 +118,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.8.13"
}
},
"nbformat": 4,
@@ -98,7 +98,7 @@
},
"outputs": [],
"source": [
"# We will plot the search incumbent through time.\n\n# Collect the performance of individual machine learning algorithms\n# found by SMAC\nindividual_performances = []\nfor run_key, run_value in estimator.run_history.data.items():\n if run_value.status != StatusType.SUCCESS:\n # Ignore crashed runs\n continue\n individual_performances.append({\n 'Timestamp': pd.Timestamp(\n time.strftime(\n '%Y-%m-%d %H:%M:%S',\n time.localtime(run_value.endtime)\n )\n ),\n 'single_best_optimization_accuracy': accuracy._optimum - run_value.cost,\n 'single_best_test_accuracy': np.nan if run_value.additional_info is None else\n accuracy._optimum - run_value.additional_info['test_loss']['accuracy'],\n })\nindividual_performance_frame = pd.DataFrame(individual_performances)\n\n# Collect the performance of the ensemble through time\n# This ensemble is built from the machine learning algorithms\n# found by SMAC\nensemble_performance_frame = pd.DataFrame(estimator.ensemble_performance_history)\n\n# As we are tracking the incumbent, we are interested in the cummax() performance\nensemble_performance_frame['ensemble_optimization_accuracy'] = ensemble_performance_frame[\n 'train_accuracy'\n].cummax()\nensemble_performance_frame['ensemble_test_accuracy'] = ensemble_performance_frame[\n 'test_accuracy'\n].cummax()\nensemble_performance_frame.drop(columns=['test_accuracy', 'train_accuracy'], inplace=True)\nindividual_performance_frame['single_best_optimization_accuracy'] = individual_performance_frame[\n 'single_best_optimization_accuracy'\n].cummax()\nindividual_performance_frame['single_best_test_accuracy'] = individual_performance_frame[\n 'single_best_test_accuracy'\n].cummax()\n\npd.merge(\n ensemble_performance_frame,\n individual_performance_frame,\n on=\"Timestamp\", how='outer'\n).sort_values('Timestamp').fillna(method='ffill').plot(\n x='Timestamp',\n kind='line',\n legend=True,\n title='Auto-PyTorch accuracy over time',\n grid=True,\n)\nplt.show()\n\n# We then can understand the importance of each input feature using\n# a permutation importance analysis. This is done as a proof of concept, to\n# showcase that we can leverage of scikit-learn API.\nresult = permutation_importance(estimator, X_train, y_train, n_repeats=5,\n scoring='accuracy',\n random_state=seed)\nsorted_idx = result.importances_mean.argsort()\n\nfig, ax = plt.subplots()\nax.boxplot(result.importances[sorted_idx].T,\n vert=False, labels=X_test.columns[sorted_idx])\nax.set_title(\"Permutation Importances (Train set)\")\nfig.tight_layout()\nplt.show()"
"# We will plot the search incumbent through time.\n\n# Collect the performance of individual machine learning algorithms\n# found by SMAC\nindividual_performances = []\nfor run_key, run_value in estimator.run_history.data.items():\n if run_value.status != StatusType.SUCCESS:\n # Ignore crashed runs\n continue\n individual_performances.append({\n 'Timestamp': pd.Timestamp(\n time.strftime(\n '%Y-%m-%d %H:%M:%S',\n time.localtime(run_value.endtime)\n )\n ),\n 'single_best_optimization_accuracy': accuracy._optimum - run_value.cost,\n 'single_best_test_accuracy': np.nan if run_value.additional_info is None else\n accuracy._optimum - run_value.additional_info['test_loss']['accuracy'],\n })\nindividual_performance_frame = pd.DataFrame(individual_performances)\n\n# Collect the performance of the ensemble through time\n# This ensemble is built from the machine learning algorithms\n# found by SMAC\nensemble_performance_frame = pd.DataFrame(estimator.ensemble_performance_history)\n\n# As we are tracking the incumbent, we are interested in the cummax() performance\nensemble_performance_frame['ensemble_optimization_accuracy'] = ensemble_performance_frame[\n 'train_accuracy'\n].cummax()\nensemble_performance_frame['ensemble_test_accuracy'] = ensemble_performance_frame[\n 'test_accuracy'\n].cummax()\nensemble_performance_frame.drop(columns=['test_accuracy', 'train_accuracy'], inplace=True)\nindividual_performance_frame['single_best_optimization_accuracy'] = individual_performance_frame[\n 'single_best_optimization_accuracy'\n].cummax()\nindividual_performance_frame['single_best_test_accuracy'] = individual_performance_frame[\n 'single_best_test_accuracy'\n].cummax()\n\npd.merge(\n ensemble_performance_frame,\n individual_performance_frame,\n on=\"Timestamp\", how='outer'\n).sort_values('Timestamp').fillna(method='ffill').plot(\n x='Timestamp',\n kind='line',\n legend=True,\n title='Auto-PyTorch accuracy over time',\n grid=True,\n)\nplt.show()"
]
}
],
@@ -118,7 +118,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.8.13"
}
},
"nbformat": 4,
@@ -172,7 +172,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.8.13"
}
},
"nbformat": 4,
@@ -247,7 +247,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.8.13"
}
},
"nbformat": 4,
@@ -118,7 +118,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.8.13"
}
},
"nbformat": 4,
@@ -0,0 +1,126 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n# Fit a single configuration\n*Auto-PyTorch* searches for the best combination of machine learning algorithms\nand their hyper-parameter configuration for a given task.\nThis example shows how one can fit one of these pipelines, both, with a user defined\nconfiguration, and a randomly sampled one form the configuration space.\nThe pipelines that Auto-PyTorch fits are compatible with Scikit-Learn API. You can\nget further documentation about Scikit-Learn models here: <https://scikit-learn.org/stable/getting_started.html`>_\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import os\nimport tempfile as tmp\nimport warnings\n\nos.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()\nos.environ['OMP_NUM_THREADS'] = '1'\nos.environ['OPENBLAS_NUM_THREADS'] = '1'\nos.environ['MKL_NUM_THREADS'] = '1'\n\nwarnings.simplefilter(action='ignore', category=UserWarning)\nwarnings.simplefilter(action='ignore', category=FutureWarning)\n\nimport sklearn.datasets\nimport sklearn.metrics\n\nfrom autoPyTorch.api.tabular_classification import TabularClassificationTask\nfrom autoPyTorch.datasets.resampling_strategy import HoldoutValTypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data Loading\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"X, y = sklearn.datasets.fetch_openml(data_id=3, return_X_y=True, as_frame=True)\nX_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(\n X, y, test_size=0.5, random_state=3\n)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define an estimator\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"estimator = TabularClassificationTask(\n resampling_strategy=HoldoutValTypes.holdout_validation,\n resampling_strategy_args={'val_share': 0.5},\n)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get a configuration of the pipeline for current dataset\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"dataset = estimator.get_dataset(X_train=X_train,\n y_train=y_train,\n X_test=X_test,\n y_test=y_test,\n dataset_name='kr-vs-kp')\nconfiguration = estimator.get_search_space(dataset).get_default_configuration()\n\nprint(\"Passed Configuration:\", configuration)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Fit the configuration\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"pipeline, run_info, run_value, dataset = estimator.fit_pipeline(dataset=dataset,\n configuration=configuration,\n budget_type='epochs',\n budget=5,\n run_time_limit_secs=75\n )\n\n# The fit_pipeline command also returns a named tuple with the pipeline constraints\nprint(run_info)\n\n# The fit_pipeline command also returns a named tuple with train/test performance\nprint(run_value)\n\n# This object complies with Scikit-Learn Pipeline API.\n# https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html\nprint(pipeline.named_steps)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
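
The markdown cell at the top of this notebook mentions fitting both a user-defined configuration and a randomly sampled one, but the cells above only fit the search space's default configuration. Below is a minimal sketch of the random variant, reusing the notebook's `estimator` and `dataset` objects; `sample_configuration()` is the standard ConfigSpace method, and the other arguments mirror the fit_pipeline call above.

# Sample a random configuration from the same search space and fit it.
search_space = estimator.get_search_space(dataset)
random_configuration = search_space.sample_configuration()
print("Random Configuration:", random_configuration)

# Fit it under the same epoch budget and time limit as the default configuration.
pipeline, run_info, run_value, dataset = estimator.fit_pipeline(
    dataset=dataset,
    configuration=random_configuration,
    budget_type='epochs',
    budget=5,
    run_time_limit_secs=75
)
print(run_value)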
