diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index a0265bedf1..d787e29dc3 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -56,6 +56,10 @@ jobs:
         python setup.py sdist
         last_dist=$(ls -t dist/auto-sklearn-*.tar.gz | head -n 1)
         pip install $last_dist
+    - name: Store repository status
+      id: status-before
+      run: |
+        echo "::set-output name=BEFORE::$(git status --porcelain -b)"
     - name: Conda Run tests
       if: matrix.use-conda == true
       run: |
@@ -66,7 +70,7 @@ jobs:
         # to change the default python
         export PATH="$CONDA/envs/testenv/bin:$PATH"
         if [ ${{ matrix.code-cov }} ]; then codecov='--cov=autosklearn --cov-report=xml'; fi
-        $CONDA/envs/testenv/bin/python3 -m pytest --durations=20 -sv $codecov test
+        $CONDA/envs/testenv/bin/python3 -m pytest --durations=20 --timeout=300 --timeout-method=thread -v $codecov test
     - name: Run tests
       if: matrix.use-conda == false
       run: |
@@ -74,7 +78,18 @@ jobs:
         export OMP_NUM_THREADS=1
         export MKL_NUM_THREADS=1
         if [ ${{ matrix.code-cov }} ]; then codecov='--cov=autosklearn --cov-report=xml'; fi
-        pytest --durations=20 -sv $codecov test
+        pytest --durations=20 --timeout=300 --timeout-method=thread -v $codecov test
+    - name: Check for files left behind by test
+      if: ${{ always() }}
+      run: |
+        before="${{ steps.status-before.outputs.BEFORE }}"
+        after="$(git status --porcelain -b)"
+        if [[ "$before" != "$after" ]]; then
+          echo "git status from before: $before"
+          echo "git status from after: $after"
+          echo "Not all generated files have been deleted!"
+          exit 1
+        fi
     - name: Upload coverage
       if: matrix.code-cov && always()
       uses: codecov/codecov-action@v1
diff --git a/.gitignore b/.gitignore
index 604e0e12d7..58595df4f7 100755
--- a/.gitignore
+++ b/.gitignore
@@ -54,3 +54,24 @@ number_submission
 .pypirc
 dmypy.json
 *.log
+
+# Dask created work space
+dask-worker-space
+
+# Python distribution generated files
+.eggs
+
+# Unit test / coverage reports
+htmlcov/
+cover
+coverage
+htmlcov
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+prof/
diff --git a/autosklearn/automl.py b/autosklearn/automl.py
index 79c6b98912..51ab9ceb1e 100644
--- a/autosklearn/automl.py
+++ b/autosklearn/automl.py
@@ -14,6 +14,7 @@ import tempfile

 from ConfigSpace.read_and_write import json as cs_json
+import dask
 import dask.distributed
 import numpy as np
 import numpy.ma as ma
@@ -230,10 +231,11 @@ def __init__(self,

     def _create_dask_client(self):
         self._is_dask_client_internally_created = True
+        dask.config.set({'distributed.worker.daemon': False})
         self._dask_client = dask.distributed.Client(
             dask.distributed.LocalCluster(
                 n_workers=self._n_jobs,
-                processes=False,
+                processes=True,
                 threads_per_worker=1,
                 # We use the temporal directory to save the
                 # dask workers, because deleting workers
@@ -269,9 +271,7 @@ def _get_logger(self, name):
         # This is gonna be honored by the server
         # Which is created below
         setup_logger(
-            output_file=os.path.join(
-                self._backend.temporary_directory, '%s.log' % str(logger_name)
-            ),
+            filename='%s.log' % str(logger_name),
             logging_config=self.logging_config,
             output_dir=self._backend.temporary_directory,
         )
@@ -294,9 +294,7 @@ def _get_logger(self, name):
             logname=logger_name,
             event=self.stop_logging_server,
             port=port,
-            output_file=os.path.join(
-                self._backend.temporary_directory, '%s.log' % str(logger_name)
-            ),
+            filename='%s.log' % str(logger_name),
             logging_config=self.logging_config,
             output_dir=self._backend.temporary_directory,
         ),
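Annotation: the `_create_dask_client` hunk above switches the internally created cluster from in-process workers to forked worker processes, which only works if those workers are non-daemonic (pynisher needs to spawn subprocesses inside them). A minimal standalone sketch of that pattern, assuming any submitted task may fork; `double` is a hypothetical placeholder task, not auto-sklearn code:

```python
import dask
import dask.distributed


def double(x):
    # Placeholder for a task that may itself spawn a subprocess.
    return 2 * x


if __name__ == '__main__':
    # Non-daemonic workers: daemonic processes may not have children, which
    # would break libraries (such as pynisher) that fork inside a task.
    dask.config.set({'distributed.worker.daemon': False})
    cluster = dask.distributed.LocalCluster(
        n_workers=2,           # one OS process per worker
        processes=True,        # subprocess workers instead of threads
        threads_per_worker=1,
    )
    client = dask.distributed.Client(cluster)
    print(client.gather(client.map(double, range(4))))  # [0, 2, 4, 6]
    client.close()
    cluster.close()
```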
diff --git a/autosklearn/ensemble_builder.py b/autosklearn/ensemble_builder.py
index 2bb97d8b54..c402e6920b 100644
--- a/autosklearn/ensemble_builder.py
+++ b/autosklearn/ensemble_builder.py
@@ -454,7 +454,11 @@ def __init__(

         # Setup the logger
         self.logger_port = logger_port
-        self.logger = get_named_client_logger('EnsembleBuilder', port=self.logger_port)
+        self.logger = get_named_client_logger(
+            name='EnsembleBuilder',
+            port=self.logger_port,
+            output_dir=self.backend.temporary_directory,
+        )

         if ensemble_nbest == 1:
             self.logger.debug("Behaviour depends on int/float: %s, %s (ensemble_nbest, type)" %
@@ -556,7 +560,11 @@ def run(
         elif time_left is not None and end_at is not None:
             raise ValueError('Cannot provide both time_left and end_at.')

-        self.logger = get_named_client_logger('EnsembleBuilder', port=self.logger_port)
+        self.logger = get_named_client_logger(
+            name='EnsembleBuilder',
+            port=self.logger_port,
+            output_dir=self.backend.temporary_directory,
+        )

         process_start_time = time.time()
         while True:
@@ -627,7 +635,11 @@ def main(self, time_left, iteration, return_predictions):
         # Pynisher jobs inside dask 'forget'
         # the logger configuration. So we have to set it up
         # accordingly
-        self.logger = get_named_client_logger('EnsembleBuilder', port=self.logger_port)
+        self.logger = get_named_client_logger(
+            name='EnsembleBuilder',
+            port=self.logger_port,
+            output_dir=self.backend.temporary_directory,
+        )
         self.start_time = time.time()
         train_pred, valid_pred, test_pred = None, None, None
diff --git a/autosklearn/util/backend.py b/autosklearn/util/backend.py
index 9606c17642..0cb90fb337 100644
--- a/autosklearn/util/backend.py
+++ b/autosklearn/util/backend.py
@@ -81,8 +81,12 @@ def __init__(self,
             )
         )
         self._output_directory = output_directory
-        self._logger = logging.get_logger(__name__)
         self.create_directories()
+        # This is the first place the logger gets created.
+        # We want to make sure any logging forward sets the correct directory
+        # were all files should be created
+        logging.setup_logger(output_dir=self._temporary_directory)
+        self._logger = logging.get_logger(__name__)

     @property
     def output_directory(self) -> Optional[str]:
diff --git a/autosklearn/util/hash.py b/autosklearn/util/hash.py
index 0c11343ba4..d65abfdf18 100644
--- a/autosklearn/util/hash.py
+++ b/autosklearn/util/hash.py
@@ -8,6 +8,9 @@
 def hash_array_or_matrix(X: np.ndarray) -> str:
     m = hashlib.md5()

+    if hasattr(X, "iloc"):
+        X = X.to_numpy()
+
     if scipy.sparse.issparse(X):
         m.update(X.indices)
         m.update(X.indptr)
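Annotation: `hash_array_or_matrix` now coerces pandas objects (anything exposing `iloc`) to a numpy array before hashing. A rough standalone sketch of the idea for dense data only; `hash_dense_data` is an illustrative helper, not the auto-sklearn function, which additionally handles sparse matrices:

```python
import hashlib

import numpy as np
import pandas as pd


def hash_dense_data(X) -> str:
    # Anything that looks like a DataFrame/Series is converted first, so
    # pandas and numpy inputs with the same content hash identically.
    if hasattr(X, "iloc"):
        X = X.to_numpy()
    m = hashlib.md5()
    m.update(np.ascontiguousarray(X).tobytes())
    return m.hexdigest()


X_np = np.array([[1.0, 2.0], [3.0, 4.0]])
X_df = pd.DataFrame(X_np)
assert hash_dense_data(X_np) == hash_dense_data(X_df)
```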
diff --git a/autosklearn/util/logging_.py b/autosklearn/util/logging_.py
index 754cb124b7..48ef805b20 100644
--- a/autosklearn/util/logging_.py
+++ b/autosklearn/util/logging_.py
@@ -16,30 +16,29 @@
 def setup_logger(
-    output_file: Optional[str] = None,
+    output_dir: str,
+    filename: Optional[str] = None,
+    distributedlog_filename: Optional[str] = None,
     logging_config: Optional[Dict] = None,
-    output_dir: Optional[str] = None,
 ) -> None:
     # logging_config must be a dictionary object specifying the configuration
     # for the loggers to be used in auto-sklearn.
-    if logging_config is not None:
-        if output_file is not None:
-            logging_config['handlers']['file_handler']['filename'] = output_file
-        if output_dir is not None:
-            logging_config['handlers']['distributed_logfile']['filename'] = os.path.join(
-                output_dir, 'distributed.log'
-            )
-        logging.config.dictConfig(logging_config)
-    else:
+    if logging_config is None:
         with open(os.path.join(os.path.dirname(__file__), 'logging.yaml'), 'r') as fh:
             logging_config = yaml.safe_load(fh)
-        if output_file is not None:
-            logging_config['handlers']['file_handler']['filename'] = output_file
-        if output_dir is not None:
-            logging_config['handlers']['distributed_logfile']['filename'] = os.path.join(
-                output_dir, 'distributed.log'
-            )
-        logging.config.dictConfig(logging_config)
+
+    if filename is None:
+        filename = logging_config['handlers']['file_handler']['filename']
+    logging_config['handlers']['file_handler']['filename'] = os.path.join(
+        output_dir, filename
+    )
+
+    if distributedlog_filename is None:
+        distributedlog_filename = logging_config['handlers']['distributed_logfile']['filename']
+    logging_config['handlers']['distributed_logfile']['filename'] = os.path.join(
+        output_dir, distributedlog_filename
+    )
+    logging.config.dictConfig(logging_config)


 def _create_logger(name: str) -> logging.Logger:
@@ -107,15 +106,22 @@ def isEnabledFor(self, level: int) -> bool:


 def get_named_client_logger(
+    output_dir: str,
     name: str,
     host: str = 'localhost',
     port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT,
 ) -> 'PicklableClientLogger':
-    logger = PicklableClientLogger(name, host, port)
+    logger = PicklableClientLogger(
+        output_dir=output_dir,
+        name=name,
+        host=host,
+        port=port
+    )
     return logger


 def _get_named_client_logger(
+    output_dir: str,
     name: str,
     host: str = 'localhost',
     port: int = logging.handlers.DEFAULT_TCP_LOGGING_PORT,
@@ -133,6 +139,8 @@

     Parameters
     ----------
+    outputdir: (str)
+        The path where the log files are going to be dumped
     name: (str)
         the name of the logger, used to tag the messages in the main log
     host: (str)
@@ -143,7 +151,7 @@
         local_loger: a logger object that has a socket handler
     """
     # Setup the logger configuration
-    setup_logger()
+    setup_logger(output_dir=output_dir)

     local_logger = _create_logger(name)

@@ -159,11 +167,17 @@


 class PicklableClientLogger(PickableLoggerAdapter):

-    def __init__(self, name: str, host: str, port: int):
+    def __init__(self, output_dir: str, name: str, host: str, port: int):
+        self.output_dir = output_dir
         self.name = name
         self.host = host
         self.port = port
-        self.logger = _get_named_client_logger(name, host, port)
+        self.logger = _get_named_client_logger(
+            output_dir=output_dir,
+            name=name,
+            host=host,
+            port=port
+        )

     def __getstate__(self) -> Dict[str, Any]:
         """
@@ -174,7 +188,12 @@ def __getstate__(self) -> Dict[str, Any]:
             Dictionary, representing the object state to be pickled. Ignores
             the self.logger field and only returns the logger name.
         """
-        return {'name': self.name, 'host': self.host, 'port': self.port}
+        return {
+            'name': self.name,
+            'host': self.host,
+            'port': self.port,
+            'output_dir': self.output_dir,
+        }

     def __setstate__(self, state: Dict[str, Any]) -> None:
         """
@@ -189,7 +208,13 @@ def __setstate__(self, state: Dict[str, Any]) -> None:
         self.name = state['name']
         self.host = state['host']
         self.port = state['port']
-        self.logger = _get_named_client_logger(self.name, self.host, self.port)
+        self.output_dir = state['output_dir']
+        self.logger = _get_named_client_logger(
+            name=self.name,
+            host=self.host,
+            port=self.port,
+            output_dir=self.output_dir,
+        )


 class LogRecordStreamHandler(socketserver.StreamRequestHandler):
@@ -242,11 +267,13 @@ def start_log_server(
     logname: str,
     event: threading.Event,
     port: multiprocessing.Value,
-    output_file: str,
+    filename: str,
     logging_config: Dict,
     output_dir: str,
 ) -> None:
-    setup_logger(output_file, logging_config, output_dir)
+    setup_logger(filename=filename,
+                 logging_config=logging_config,
+                 output_dir=output_dir)

     while True:
         # Loop until we find a valid port
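Annotation: after this refactor `setup_logger` always takes an `output_dir` and joins every handler's filename onto it, and `get_named_client_logger`/`PicklableClientLogger` carry the directory through pickling so loggers can be re-created inside dask workers. A hedged usage sketch; the temporary directory stands in for `backend.temporary_directory`, and the client logger assumes auto-sklearn's TCP log server is listening on the default port:

```python
import logging
import tempfile

from autosklearn.util.logging_ import get_named_client_logger, setup_logger

log_dir = tempfile.mkdtemp()  # stands in for backend.temporary_directory

# Both the main log file and distributed.log are created under log_dir.
setup_logger(output_dir=log_dir, filename='AutoML.log')
logging.getLogger(__name__).info('logging is rooted in %s', log_dir)

# Picklable client logger: output_dir is part of its pickled state, so a
# dask worker that unpickles it reconfigures logging into the same directory.
client_logger = get_named_client_logger(
    name='EnsembleBuilder',
    output_dir=log_dir,
)
client_logger.info('hello from a client logger')
```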
""" - return {'name': self.name, 'host': self.host, 'port': self.port} + return { + 'name': self.name, + 'host': self.host, + 'port': self.port, + 'output_dir': self.output_dir, + } def __setstate__(self, state: Dict[str, Any]) -> None: """ @@ -189,7 +208,13 @@ def __setstate__(self, state: Dict[str, Any]) -> None: self.name = state['name'] self.host = state['host'] self.port = state['port'] - self.logger = _get_named_client_logger(self.name, self.host, self.port) + self.output_dir = state['output_dir'] + self.logger = _get_named_client_logger( + name=self.name, + host=self.host, + port=self.port, + output_dir=self.output_dir, + ) class LogRecordStreamHandler(socketserver.StreamRequestHandler): @@ -242,11 +267,13 @@ def start_log_server( logname: str, event: threading.Event, port: multiprocessing.Value, - output_file: str, + filename: str, logging_config: Dict, output_dir: str, ) -> None: - setup_logger(output_file, logging_config, output_dir) + setup_logger(filename=filename, + logging_config=logging_config, + output_dir=output_dir) while True: # Loop until we find a valid port diff --git a/test/conftest.py b/test/conftest.py index 4db32f6af7..4047711b01 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -3,6 +3,7 @@ import time import unittest.mock +import dask from dask.distributed import Client, get_client import psutil import pytest @@ -124,7 +125,8 @@ def dask_client(request): Workers are in subprocesses to not create deadlocks with the pynisher and logging. """ - client = Client(n_workers=2, threads_per_worker=1, processes=False) + dask.config.set({'distributed.worker.daemon': False}) + client = Client(n_workers=2, threads_per_worker=1, processes=True) print("Started Dask client={}\n".format(client)) def get_finalizer(address): @@ -149,6 +151,7 @@ def dask_client_single_worker(request): it is used very rarely to avoid this issue as much as possible. 
""" + dask.config.set({'distributed.worker.daemon': False}) client = Client(n_workers=1, threads_per_worker=1, processes=False) print("Started Dask client={}\n".format(client)) diff --git a/test/test_automl/test_automl.py b/test/test_automl/test_automl.py index caa43d037c..31364f966a 100644 --- a/test/test_automl/test_automl.py +++ b/test/test_automl/test_automl.py @@ -92,7 +92,7 @@ def get_roar_object_callback( metric=accuracy, dask_client=dask_client_single_worker, ) - setup_logger() + setup_logger(backend.temporary_directory) automl._logger = get_logger('test_fit_roar') automl.fit( X_train, Y_train, task=MULTICLASS_CLASSIFICATION, @@ -279,7 +279,7 @@ def test_automl_outputs(backend, dask_client): metric=accuracy, dask_client=dask_client, ) - setup_logger() + setup_logger(backend.temporary_directory) auto._logger = get_logger('test_automl_outputs') auto.fit( X=X_train, @@ -337,48 +337,47 @@ def test_automl_outputs(backend, dask_client): del auto -def test_do_dummy_prediction(backend, dask_client): - datasets = { - 'breast_cancer': BINARY_CLASSIFICATION, - 'wine': MULTICLASS_CLASSIFICATION, - 'diabetes': REGRESSION, - } +@pytest.mark.parametrize("datasets", [('breast_cancer', BINARY_CLASSIFICATION), + ('wine', MULTICLASS_CLASSIFICATION), + ('diabetes', REGRESSION)]) +def test_do_dummy_prediction(backend, dask_client, datasets): - for name, task in datasets.items(): + name, task = datasets - X_train, Y_train, X_test, Y_test = putil.get_dataset(name) - datamanager = XYDataManager( - X_train, Y_train, - X_test, Y_test, - task=task, - dataset_name=name, - feat_type=None, - ) + X_train, Y_train, X_test, Y_test = putil.get_dataset(name) + datamanager = XYDataManager( + X_train, Y_train, + X_test, Y_test, + task=task, + dataset_name=name, + feat_type=None, + ) - auto = autosklearn.automl.AutoML( - backend, 20, 5, - initial_configurations_via_metalearning=25, - metric=accuracy, - dask_client=dask_client, - ) - setup_logger() - auto._logger = get_logger('test_do_dummy_predictions') - auto._backend.save_datamanager(datamanager) - D = backend.load_datamanager() - - # Check if data manager is correcly loaded - assert D.info['task'] == datamanager.info['task'] - auto._do_dummy_prediction(D, 1) - - # Ensure that the dummy predictions are not in the current working - # directory, but in the temporary directory. - assert not os.path.exists(os.path.join(os.getcwd(), '.auto-sklearn')) - assert os.path.exists(os.path.join( - backend.temporary_directory, '.auto-sklearn', 'runs', '1_1_0.0', - 'predictions_ensemble_1_1_0.0.npy') - ) + auto = autosklearn.automl.AutoML( + backend, 20, 5, + initial_configurations_via_metalearning=25, + metric=accuracy, + dask_client=dask_client, + ) + setup_logger(backend.temporary_directory) + auto._logger = get_logger('test_do_dummy_predictions') - del auto + auto._backend.save_datamanager(datamanager) + D = backend.load_datamanager() + + # Check if data manager is correcly loaded + assert D.info['task'] == datamanager.info['task'] + auto._do_dummy_prediction(D, 1) + + # Ensure that the dummy predictions are not in the current working + # directory, but in the temporary directory. 
diff --git a/test/test_util/example_config.yaml b/test/test_util/example_config.yaml
index 7c93e1b846..84849a9b5e 100644
--- a/test/test_util/example_config.yaml
+++ b/test/test_util/example_config.yaml
@@ -18,6 +18,12 @@ handlers:
         formatter: simple
         filename: autosklearn.log

+    distributed_logfile:
+        class: logging.FileHandler
+        level: DEBUG
+        formatter: simple
+        filename: distributed.log
+
 root:
     level: CRITICAL
     handlers: [console, file_handler]
diff --git a/test/test_util/test_logging.py b/test/test_util/test_logging.py
index 71760a690c..568593c7c8 100644
--- a/test/test_util/test_logging.py
+++ b/test/test_util/test_logging.py
@@ -2,7 +2,10 @@
 import unittest
 import logging
 import logging.config
+import tempfile
 import yaml
+
+
 from autosklearn.util import logging_
@@ -17,15 +20,28 @@ def test_setup_logger(self):
             example_config = yaml.safe_load(fh)

         # Configure logger with example_config.yaml.
-        logging_.setup_logger(logging_config=example_config)
+        logging_.setup_logger(logging_config=example_config,
+                              output_dir=tempfile.gettempdir())

         # example_config sets the root logger's level to CRITICAL,
         # which corresponds to 50.
         self.assertEqual(logging.getLogger().getEffectiveLevel(), 50)

         # This time use the default configuration.
-        logging_.setup_logger(logging_config=None)
+        logging_.setup_logger(logging_config=None,
+                              output_dir=tempfile.gettempdir())

         # default config sets the root logger's level to DEBUG,
         # which corresponds to 10.
         self.assertEqual(logging.getLogger().getEffectiveLevel(), 10)
+
+        # Make sure we log to the desired directory
+        logging_.setup_logger(output_dir=os.path.dirname(__file__),
+                              filename='test.log'
+                              )
+        logger = logging.getLogger()
+        logger.info('test_setup_logger')
+
+        with open(os.path.join(os.path.dirname(__file__), 'test.log')) as fh:
+            self.assertIn('test_setup_logger', ''.join(fh.readlines()))
+        os.remove(os.path.join(os.path.dirname(__file__), 'test.log'))
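Annotation: the example config gains the `distributed_logfile` handler that the new `setup_logger` rewrites, and the added test checks that log output really lands in `output_dir`. A stdlib-only sketch of that behaviour, assuming a minimal dictConfig of my own (format strings and paths are illustrative, not taken from auto-sklearn's logging.yaml):

```python
import logging
import logging.config
import os
import tempfile

output_dir = tempfile.mkdtemp()

config = {
    'version': 1,
    'formatters': {'simple': {'format': '%(asctime)s %(levelname)s %(message)s'}},
    'handlers': {
        'file_handler': {
            'class': 'logging.FileHandler',
            'formatter': 'simple',
            'filename': 'autosklearn.log',
        },
        'distributed_logfile': {
            'class': 'logging.FileHandler',
            'level': 'DEBUG',
            'formatter': 'simple',
            'filename': 'distributed.log',
        },
    },
    'root': {'level': 'DEBUG', 'handlers': ['file_handler', 'distributed_logfile']},
}

# Roughly what setup_logger does: anchor every handler's filename under
# output_dir before handing the config to dictConfig.
for handler in ('file_handler', 'distributed_logfile'):
    config['handlers'][handler]['filename'] = os.path.join(
        output_dir, config['handlers'][handler]['filename'])

logging.config.dictConfig(config)
logging.getLogger(__name__).info('test_setup_logger')

with open(os.path.join(output_dir, 'autosklearn.log')) as fh:
    assert 'test_setup_logger' in fh.read()
```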