bastiscode: Adding tabular regression pipeline (#85)

automl · Feb 18, 2021 · 2129a3f · 2129a3f
1 parent a495490
commit 2129a3f
Show file tree

Hide file tree

Showing 28 changed files with 899 additions and 48 deletions.
diff --git a/...tor_development/_downloads/0baaec1666f007b22da0886cb1b9e240/example_tabular_regression.py b/...tor_development/_downloads/0baaec1666f007b22da0886cb1b9e240/example_tabular_regression.py
@@ -0,0 +1,116 @@
+"""
+======================
+Tabular Regression
+======================
+
+The following example shows how to fit a sample regression model
+with AutoPyTorch
+"""
+import os
+import tempfile as tmp
+import typing
+import warnings
+
+from sklearn.datasets import make_regression
+
+from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator
+
+os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
+os.environ['OMP_NUM_THREADS'] = '1'
+os.environ['OPENBLAS_NUM_THREADS'] = '1'
+os.environ['MKL_NUM_THREADS'] = '1'
+
+warnings.simplefilter(action='ignore', category=UserWarning)
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+from sklearn import model_selection, preprocessing
+
+from autoPyTorch.api.tabular_regression import TabularRegressionTask
+from autoPyTorch.datasets.tabular_dataset import TabularDataset
+from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates
+
+
+def get_search_space_updates():
+    """
+    Search space updates to the task can be added using HyperparameterSearchSpaceUpdates
+    Returns:
+        HyperparameterSearchSpaceUpdates
+    """
+    updates = HyperparameterSearchSpaceUpdates()
+    updates.append(node_name="data_loader",
+                   hyperparameter="batch_size",
+                   value_range=[16, 512],
+                   default_value=32)
+    updates.append(node_name="lr_scheduler",
+                   hyperparameter="CosineAnnealingLR:T_max",
+                   value_range=[50, 60],
+                   default_value=55)
+    updates.append(node_name='network_backbone',
+                   hyperparameter='ResNetBackbone:dropout',
+                   value_range=[0, 0.5],
+                   default_value=0.2)
+    return updates
+
+
+if __name__ == '__main__':
+    ############################################################################
+    # Data Loading
+    # ============
+
+    # Get the training data for tabular regression
+    # X, y = datasets.fetch_openml(name="cholesterol", return_X_y=True)
+
+    # Use dummy data for now since there are problems with categorical columns
+    X, y = make_regression(
+        n_samples=5000,
+        n_features=4,
+        n_informative=3,
+        n_targets=1,
+        shuffle=True,
+        random_state=0
+    )
+
+    X_train, X_test, y_train, y_test = model_selection.train_test_split(
+        X,
+        y,
+        random_state=1,
+    )
+
+    # Scale the regression targets to have zero mean and unit variance.
+    # This is important for Neural Networks since predicting large target values would require very large weights.
+    # One can later rescale the network predictions like this: y_pred = y_pred_scaled * y_train_std + y_train_mean
+    y_train_mean = y_train.mean()
+    y_train_std = y_train.std()
+
+    y_train_scaled = (y_train - y_train_mean) / y_train_std
+    y_test_scaled = (y_test - y_train_mean) / y_train_std
+
+    ############################################################################
+    # Build and fit a regressor
+    # ==========================
+    api = TabularRegressionTask(
+        delete_tmp_folder_after_terminate=False,
+        search_space_updates=get_search_space_updates()
+    )
+    api.search(
+        X_train=X_train,
+        y_train=y_train_scaled,
+        X_test=X_test.copy(),
+        y_test=y_test_scaled.copy(),
+        optimize_metric='r2',
+        total_walltime_limit=500,
+        func_eval_time_limit=50,
+        traditional_per_total_budget=0
+    )
+
+    ############################################################################
+    # Print the final ensemble performance
+    # ====================================
+    print(api.run_history, api.trajectory)
+    y_pred_scaled = api.predict(X_test)
+
+    # Rescale the Neural Network predictions into the original target range
+    y_pred = y_pred_scaled * y_train_std + y_train_mean
+    score = api.score(y_pred, y_test)
+
+    print(score)
diff --git a/..._development/_downloads/306036486863b5329c4111d8adbaac63/example_tabular_regression.ipynb b/..._development/_downloads/306036486863b5329c4111d8adbaac63/example_tabular_regression.ipynb
@@ -0,0 +1,54 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n# Tabular Regression\n\nThe following example shows how to fit a sample regression model\nwith AutoPyTorch\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import os\nimport tempfile as tmp\nimport typing\nimport warnings\n\nfrom sklearn.datasets import make_regression\n\nfrom autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator\n\nos.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()\nos.environ['OMP_NUM_THREADS'] = '1'\nos.environ['OPENBLAS_NUM_THREADS'] = '1'\nos.environ['MKL_NUM_THREADS'] = '1'\n\nwarnings.simplefilter(action='ignore', category=UserWarning)\nwarnings.simplefilter(action='ignore', category=FutureWarning)\n\nfrom sklearn import model_selection, preprocessing\n\nfrom autoPyTorch.api.tabular_regression import TabularRegressionTask\nfrom autoPyTorch.datasets.tabular_dataset import TabularDataset\nfrom autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates\n\n\ndef get_search_space_updates():\n    \"\"\"\n    Search space updates to the task can be added using HyperparameterSearchSpaceUpdates\n    Returns:\n        HyperparameterSearchSpaceUpdates\n    \"\"\"\n    updates = HyperparameterSearchSpaceUpdates()\n    updates.append(node_name=\"data_loader\",\n                   hyperparameter=\"batch_size\",\n                   value_range=[16, 512],\n                   default_value=32)\n    updates.append(node_name=\"lr_scheduler\",\n                   hyperparameter=\"CosineAnnealingLR:T_max\",\n                   value_range=[50, 60],\n                   default_value=55)\n    updates.append(node_name='network_backbone',\n                   hyperparameter='ResNetBackbone:dropout',\n                   value_range=[0, 0.5],\n                   default_value=0.2)\n    return updates\n\n\nif __name__ == '__main__':\n    ############################################################################\n    # Data Loading\n    # ============\n\n    # Get the training data for tabular regression\n    # X, y = datasets.fetch_openml(name=\"cholesterol\", return_X_y=True)\n\n    # Use dummy data for now since there are problems with categorical 
columns\n    X, y = make_regression(\n        n_samples=5000,\n        n_features=4,\n        n_informative=3,\n        n_targets=1,\n        shuffle=True,\n        random_state=0\n    )\n\n    X_train, X_test, y_train, y_test = model_selection.train_test_split(\n        X,\n        y,\n        random_state=1,\n    )\n\n    # Scale the regression targets to have zero mean and unit variance.\n    # This is important for Neural Networks since predicting large target values would require very large weights.\n    # One can later rescale the network predictions like this: y_pred = y_pred_scaled * y_train_std + y_train_mean\n    y_train_mean = y_train.mean()\n    y_train_std = y_train.std()\n\n    y_train_scaled = (y_train - y_train_mean) / y_train_std\n    y_test_scaled = (y_test - y_train_mean) / y_train_std\n\n    ############################################################################\n    # Build and fit a regressor\n    # ==========================\n    api = TabularRegressionTask(\n        delete_tmp_folder_after_terminate=False,\n        search_space_updates=get_search_space_updates()\n    )\n    api.search(\n        X_train=X_train,\n        y_train=y_train_scaled,\n        X_test=X_test.copy(),\n        y_test=y_test_scaled.copy(),\n        optimize_metric='r2',\n        total_walltime_limit=500,\n        func_eval_time_limit=50,\n        traditional_per_total_budget=0\n    )\n\n    ############################################################################\n    # Print the final ensemble performance\n    # ====================================\n    print(api.run_history, api.trajectory)\n    y_pred_scaled = api.predict(X_test)\n\n    # Rescale the Neural Network predictions into the original target range\n    y_pred = y_pred_scaled * y_train_std + y_train_mean\n    score = api.score(y_pred, y_test)\n\n    print(score)"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.8.7"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/refactor_development/_downloads/bc82bea3a5dd7bdba60b65220891d9e5/examples_python.zip b/refactor_development/_downloads/bc82bea3a5dd7bdba60b65220891d9e5/examples_python.zip
diff --git a/refactor_development/_downloads/fb625db3c50d423b1b7881136ffdeec8/examples_jupyter.zip b/refactor_development/_downloads/fb625db3c50d423b1b7881136ffdeec8/examples_jupyter.zip
diff --git a/refactor_development/_images/sphx_glr_example_tabular_regression_thumb.png b/refactor_development/_images/sphx_glr_example_tabular_regression_thumb.png
diff --git a/refactor_development/_modules/autoPyTorch/api/tabular_classification.html b/refactor_development/_modules/autoPyTorch/api/tabular_classification.html
@@ -390,7 +390,7 @@ <h1>Source code for autoPyTorch.api.tabular_classification</h1><div class="highl
     </p>
     <p>
         &copy; Copyright 2014-2019, Machine Learning Professorship Freiburg.<br/>
-      Created using <a href="http://sphinx-doc.org/">Sphinx</a> 3.5.0.<br/>
+      Created using <a href="http://sphinx-doc.org/">Sphinx</a> 3.5.1.<br/>
     </p>
   </div>
 </footer>

diff --git a/refactor_development/_modules/index.html b/refactor_development/_modules/index.html
@@ -128,7 +128,7 @@ <h1>All modules for which code is available</h1>
     </p>
     <p>
         &copy; Copyright 2014-2019, Machine Learning Professorship Freiburg.<br/>
-      Created using <a href="http://sphinx-doc.org/">Sphinx</a> 3.5.0.<br/>
+      Created using <a href="http://sphinx-doc.org/">Sphinx</a> 3.5.1.<br/>
     </p>
   </div>
 </footer>

diff --git a/refactor_development/_sources/examples/example_image_classification.rst.txt b/refactor_development/_sources/examples/example_image_classification.rst.txt
@@ -77,22 +77,21 @@ Image Classification
     Pipeline Random Config:
      ________________________________________ 
     Configuration:
-      image_augmenter:GaussianBlur:sigma_min, Value: 0.3669408750205385
-      image_augmenter:GaussianBlur:sigma_offset, Value: 1.089417074756883
-      image_augmenter:GaussianBlur:use_augmenter, Value: True
+      image_augmenter:GaussianBlur:use_augmenter, Value: False
       image_augmenter:GaussianNoise:use_augmenter, Value: False
       image_augmenter:RandomAffine:use_augmenter, Value: False
-      image_augmenter:RandomCutout:use_augmenter, Value: False
+      image_augmenter:RandomCutout:p, Value: 0.8490799303808481
+      image_augmenter:RandomCutout:use_augmenter, Value: True
       image_augmenter:Resize:use_augmenter, Value: False
-      image_augmenter:ZeroPadAndCrop:percent, Value: 0.03654863814289566
-      normalizer:__choice__, Value: 'NoNormalizer'
+      image_augmenter:ZeroPadAndCrop:percent, Value: 0.2993076189415605
+      normalizer:__choice__, Value: 'ImageNormalizer'
 
     Fitting the pipeline...
     ________________________________________
             ImageClassificationPipeline
     ________________________________________
     0-) normalizer: 
-            NoNormalizer
+            ImageNormalizer
 
     1-) preprocessing: 
             EarlyPreprocessing
@@ -164,7 +163,7 @@ Image Classification
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 0 minutes  5.940 seconds)
+   **Total running time of the script:** ( 0 minutes  7.334 seconds)
 
 
 .. _sphx_glr_download_examples_example_image_classification.py:

diff --git a/refactor_development/_sources/examples/example_tabular_classification.rst.txt b/refactor_development/_sources/examples/example_tabular_classification.rst.txt
@@ -36,7 +36,7 @@ with AutoPyTorch
 
  .. code-block:: none
 
-    <smac.runhistory.runhistory.RunHistory object at 0x7f8679d7a9d0> [TrajEntry(train_perf=2147483648, incumbent_id=1, incumbent=Configuration:
+    <smac.runhistory.runhistory.RunHistory object at 0x7fd25a7cde50> [TrajEntry(train_perf=2147483648, incumbent_id=1, incumbent=Configuration:
       data_loader:batch_size, Value: 32
       encoder:__choice__, Value: 'OneHotEncoder'
       feature_preprocessor:__choice__, Value: 'NoFeaturePreprocessor'
@@ -65,8 +65,9 @@ with AutoPyTorch
       optimizer:AdamOptimizer:weight_decay, Value: 0.0
       optimizer:__choice__, Value: 'AdamOptimizer'
       scaler:__choice__, Value: 'StandardScaler'
+      trainer:StandardTrainer:weighted_loss, Value: True
       trainer:__choice__, Value: 'StandardTrainer'
-    , ta_runs=0, ta_time_used=0.0, wallclock_time=0.0018706321716308594, budget=0), TrajEntry(train_perf=0.14035087719298245, incumbent_id=1, incumbent=Configuration:
+    , ta_runs=0, ta_time_used=0.0, wallclock_time=0.0015573501586914062, budget=0), TrajEntry(train_perf=0.17543859649122806, incumbent_id=1, incumbent=Configuration:
       data_loader:batch_size, Value: 32
       encoder:__choice__, Value: 'OneHotEncoder'
       feature_preprocessor:__choice__, Value: 'NoFeaturePreprocessor'
@@ -95,9 +96,10 @@ with AutoPyTorch
       optimizer:AdamOptimizer:weight_decay, Value: 0.0
       optimizer:__choice__, Value: 'AdamOptimizer'
       scaler:__choice__, Value: 'StandardScaler'
+      trainer:StandardTrainer:weighted_loss, Value: True
       trainer:__choice__, Value: 'StandardTrainer'
-    , ta_runs=1, ta_time_used=5.345165252685547, wallclock_time=6.850844860076904, budget=5.555555555555555)]
-    {'accuracy': 0.8786127167630058}
+    , ta_runs=1, ta_time_used=3.7913620471954346, wallclock_time=5.133898019790649, budget=5.555555555555555)]
+    {'accuracy': 0.8728323699421965}
 
 
 
@@ -188,7 +190,7 @@ with AutoPyTorch
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 9 minutes  10.874 seconds)
+   **Total running time of the script:** ( 8 minutes  58.645 seconds)
 
 
 .. _sphx_glr_download_examples_example_tabular_classification.py: