Improve ensemble selection memory usage #997

Merged · 4 commits · Nov 5, 2020
4 changes: 2 additions & 2 deletions .travis.yml
@@ -23,9 +23,9 @@ matrix:
     - os: linux
       env: DISTRIB="conda" DOCPUSH="true" PYTHON="3.7" SKIP_TESTS="true"
     - os: linux
-      env: DISTRIB="conda" RUN_FLAKE8="true" SKIP_TESTS="true"
+      env: DISTRIB="conda" PYTHON="3.8" RUN_FLAKE8="true" SKIP_TESTS="true"
     - os: linux
-      env: DISTRIB="conda" RUN_MYPY="true" SKIP_TESTS="true"
+      env: DISTRIB="conda" PYTHON="3.8" RUN_MYPY="true" SKIP_TESTS="true"
     - os: linux
       env: DISTRIB="conda" COVERAGE="true" PYTHON="3.6"
     - os: linux
5 changes: 4 additions & 1 deletion autosklearn/automl.py
@@ -872,7 +872,10 @@ def fit_ensemble(self, y, task=None, precision=32,
         future = manager.futures.pop()
         dask.distributed.wait([future])  # wait for the ensemble process to finish
         result = future.result()
-        self.ensemble_performance_history, _ = result
+        if result is None:
+            raise ValueError("Error building the ensemble - please check the log file and command "
+                             "line output for error messages.")
+        self.ensemble_performance_history, _, _, _, _ = result

         self._load_models()
         self._close_dask_client()
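Aside: the pattern introduced here (wait on the dask future, then validate before unpacking) matters because the worker-side builder signals failure by returning None. A minimal sketch of the same guard, with illustrative names only, not auto-sklearn's API:

```python
# Minimal sketch, assuming a dask.distributed.Client named `client` and a
# worker job that returns either None (failure) or a 5-tuple whose first
# element is the performance history.
import dask.distributed

def collect_ensemble_history(client: dask.distributed.Client):
    future = client.submit(lambda: ([], None, None, None, None))  # stand-in job
    dask.distributed.wait([future])   # block until the worker finishes
    result = future.result()
    if result is None:                # failure is signalled with None
        raise ValueError("Error building the ensemble - check the logs.")
    history, *_ = result              # keep only the performance history
    return history
```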
325 changes: 190 additions & 135 deletions autosklearn/ensemble_builder.py

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions autosklearn/ensembles/abstract_ensemble.py
@@ -1,5 +1,5 @@
 from abc import ABCMeta, abstractmethod
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Union

 import numpy as np
@@ -40,7 +40,7 @@ def fit(
         pass

     @abstractmethod
-    def predict(self, base_models_predictions: np.ndarray) -> np.ndarray:
+    def predict(self, base_models_predictions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray:
         """Create ensemble predictions from the base model predictions.

         Parameters
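Why widening the signature to `Union[np.ndarray, List[np.ndarray]]` is safe: `len()` and iteration behave identically on a list of 2-D prediction arrays and on a stacked 3-D array, so implementations can avoid the `np.asarray` copy. A quick standalone check (not project code):

```python
import numpy as np

# Five base models, each predicting probabilities for 4 samples x 3 classes.
as_list = [np.zeros((4, 3)) for _ in range(5)]
as_array = np.asarray(as_list)             # stacked (5, 4, 3) block

assert len(as_list) == len(as_array) == 5  # len() agrees on both forms
assert all(p.shape == (4, 3) for p in as_list)
assert all(p.shape == (4, 3) for p in as_array)  # iterating yields 2-D slices
```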
25 changes: 15 additions & 10 deletions autosklearn/ensembles/ensemble_selection.py
@@ -1,6 +1,6 @@
 import random
 from collections import Counter
-from typing import Any, Dict, List, Tuple, cast
+from typing import Any, Dict, List, Tuple, Union, cast

 import numpy as np
@@ -265,27 +265,32 @@ def _bagging(
             dtype=np.int64,
         )

-    def predict(self, predictions: np.ndarray) -> np.ndarray:
-        predictions = np.asarray(
-            predictions,
-            dtype=np.float64,
-        )
+    def predict(self, predictions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray:
+
+        average = np.zeros_like(predictions[0], dtype=np.float64)
+        tmp_predictions = np.empty_like(predictions[0], dtype=np.float64)

         # if predictions.shape[0] == len(self.weights_),
         # predictions include those of zero-weight models.
-        if predictions.shape[0] == len(self.weights_):
-            return np.average(predictions, axis=0, weights=self.weights_)
+        if len(predictions) == len(self.weights_):
+            for pred, weight in zip(predictions, self.weights_):
+                np.multiply(pred, weight, out=tmp_predictions)
+                np.add(average, tmp_predictions, out=average)

         # if prediction model.shape[0] == len(non_null_weights),
         # predictions do not include those of zero-weight models.
-        elif predictions.shape[0] == np.count_nonzero(self.weights_):
+        elif len(predictions) == np.count_nonzero(self.weights_):
             non_null_weights = [w for w in self.weights_ if w > 0]
-            return np.average(predictions, axis=0, weights=non_null_weights)
+            for pred, weight in zip(predictions, non_null_weights):
+                np.multiply(pred, weight, out=tmp_predictions)
+                np.add(average, tmp_predictions, out=average)

         # If none of the above applies, then something must have gone wrong.
         else:
             raise ValueError("The dimensions of ensemble predictions"
                              " and ensemble weights do not match!")
+        del tmp_predictions
+        return average

     def __str__(self) -> str:
         return 'Ensemble Selection:\n\tTrajectory: %s\n\tMembers: %s' \
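The predict rewrite above is the heart of the memory improvement: the old code materialized all base-model predictions as one float64 `(n_models, n_samples, n_classes)` block via `np.asarray` and then let `np.average` allocate further temporaries, while the new code keeps the inputs as-is and accumulates into two preallocated buffers of a single prediction's shape. A self-contained sketch of the same idea, with an equivalence check against the old path:

```python
from typing import List
import numpy as np

def weighted_average(predictions: List[np.ndarray], weights: List[float]) -> np.ndarray:
    # Peak extra memory: two (n_samples, n_classes) buffers, regardless of
    # how many base models there are.
    average = np.zeros_like(predictions[0], dtype=np.float64)
    buffer = np.empty_like(predictions[0], dtype=np.float64)
    for pred, weight in zip(predictions, weights):
        np.multiply(pred, weight, out=buffer)  # scale into the reusable buffer
        np.add(average, buffer, out=average)   # accumulate in place
    return average

preds = [np.random.rand(1000, 10) for _ in range(5)]
weights = [0.4, 0.3, 0.2, 0.1, 0.0]  # sum to 1, as ensemble selection guarantees
assert np.allclose(
    weighted_average(preds, weights),
    np.average(np.asarray(preds), axis=0, weights=weights),  # the old approach
)
```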
4 changes: 2 additions & 2 deletions autosklearn/ensembles/singlebest_ensemble.py
@@ -1,5 +1,5 @@
 import os
-from typing import List, Tuple
+from typing import List, Tuple, Union

 import numpy as np
@@ -85,7 +85,7 @@ def get_identifiers_from_run_history(self) -> List[Tuple[int, int, float]]:

return best_model_identifier

def predict(self, predictions: np.ndarray) -> np.ndarray:
def predict(self, predictions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray:
return predictions[0]

def __str__(self) -> str:
7 changes: 6 additions & 1 deletion autosklearn/estimators.py
@@ -42,6 +42,7 @@ def __init__(
         logging_config=None,
         metadata_directory=None,
         metric=None,
+        load_models: bool = True,
     ):
         """
         Parameters
@@ -216,6 +217,9 @@ def __init__(
             :meth:`autosklearn.metrics.make_scorer`. These are the `Built-in
             Metrics`_.
             If None is provided, a default metric is selected depending on the task.
+
+        load_models : bool, optional (True)
+            Whether to load the models after fitting Auto-sklearn.

         Attributes
         ----------
@@ -257,6 +261,7 @@ def __init__(
         self.logging_config = logging_config
         self.metadata_directory = metadata_directory
         self._metric = metric
+        self._load_models = load_models

         self.automl_ = None  # type: Optional[AutoML]
         # n_jobs after conversion to a number (b/c default is None)
@@ -340,7 +345,7 @@ def fit(self, **kwargs):
             tmp_folder=self.tmp_folder,
             output_folder=self.output_folder,
         )
-        self.automl_.fit(load_models=True, **kwargs)
+        self.automl_.fit(load_models=self._load_models, **kwargs)

         return self
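The new `load_models` flag lets a user skip deserializing every fitted model right after `fit()`, which pairs naturally with this PR's memory goal. A hedged usage sketch; it assumes the flag is forwarded unchanged to `AutoSklearnClassifier`, which inherits this `__init__`:

```python
from sklearn.datasets import load_digits
from autosklearn.classification import AutoSklearnClassifier

X, y = load_digits(return_X_y=True)

automl = AutoSklearnClassifier(
    time_left_for_this_task=60,
    load_models=False,  # new flag: keep fitted models on disk after fit()
)
automl.fit(X, y)
# Fitted models stay on disk; useful when only run statistics are needed
# immediately and the models can be loaded later.
```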
1 change: 1 addition & 0 deletions test/test_automl/test_automl.py
@@ -305,6 +305,7 @@ def test_automl_outputs(backend, dask_client):
         'start_time_100',
         'datamanager.pkl',
         'ensemble_read_preds.pkl',
+        'ensemble_read_scores.pkl',
         'runs',
         'ensembles',
     ]
28 changes: 15 additions & 13 deletions test/test_ensemble_builder/ensemble_utils.py
@@ -1,16 +1,14 @@
 import os
+import shutil
 import unittest

 import numpy as np

 from autosklearn.metrics import make_scorer
 from autosklearn.ensemble_builder import (
-    EnsembleBuilder,
+    EnsembleBuilder, AbstractEnsemble
 )

-this_directory = os.path.dirname(__file__)
-

 def scorer_function(a, b):
     return 0.9
@@ -21,22 +19,19 @@ def scorer_function(a, b):

 class BackendMock(object):

-    def __init__(self):
-        self.temporary_directory = os.path.join(
-            this_directory, 'data',
-        )
-        self.internals_directory = os.path.join(
-            this_directory, 'data', '.auto-sklearn',
-        )
+    def __init__(self, target_directory):
+        this_directory = os.path.abspath(
+            os.path.dirname(__file__)
+        )
+        shutil.copytree(os.path.join(this_directory, 'data'), os.path.join(target_directory))
+        self.temporary_directory = target_directory
+        self.internals_directory = os.path.join(self.temporary_directory, '.auto-sklearn')

     def load_datamanager(self):
         manager = unittest.mock.Mock()
         manager.__reduce__ = lambda self: (unittest.mock.MagicMock, ())
         array = np.load(os.path.join(
-            this_directory, 'data',
+            self.temporary_directory,
             '.auto-sklearn',
             'runs', '0_3_100.0',
             'predictions_test_0_3_100.0.npy'
@@ -60,7 +55,7 @@ def save_predictions_as_txt(self, predictions, subset, idx, prefix, precision):
         return

     def get_runs_directory(self) -> str:
-        return os.path.join(this_directory, 'data', '.auto-sklearn', 'runs')
+        return os.path.join(self.temporary_directory, '.auto-sklearn', 'runs')

     def get_numrun_directory(self, seed: int, num_run: int, budget: float) -> str:
         return os.path.join(self.get_runs_directory(), '%d_%d_%s' % (seed, num_run, budget))
@@ -97,4 +92,11 @@ def compare_read_preds(read_preds1, read_preds2):
 class EnsembleBuilderMemMock(EnsembleBuilder):

     def fit_ensemble(self, selected_keys):
         return True
+
+    def predict(self, set_: str,
+                ensemble: AbstractEnsemble,
+                selected_keys: list,
+                n_preds: int,
+                index_run: int):
+        np.ones([10000000, 1000000])
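For context on `EnsembleBuilderMemMock`: its `predict` tries to allocate a 10000000 x 1000000 float64 array (on the order of 80 TB), so any test running the builder under a memory cap can observe the limit being enforced. A rough sketch of that kind of check, assuming pynisher's 0.x API (`enforce_limits`), which auto-sklearn used at the time; the exact arguments are an assumption, not the project's test code:

```python
import numpy as np
import pynisher

def allocate():
    # Mirrors the mock above: an allocation that cannot possibly succeed.
    return np.ones([10000000, 1000000])

# Run under a 100 MB cap; pynisher returns None when the limit is breached.
safe_allocate = pynisher.enforce_limits(mem_in_mb=100)(allocate)
assert safe_allocate() is None
```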