
Commit 0cfdec3
Merge pull request #284 from asappresearch/hotfix-ray-debug-jeremyasapp
Hotfix ray debug jeremyasapp
jeremyasapp committed Mar 31, 2020
2 parents 76c791e + 4bd3958 commit 0cfdec3
Showing 3 changed files with 134 additions and 6 deletions.
7 changes: 2 additions & 5 deletions flambe/experiment/experiment.py
@@ -30,6 +30,7 @@
from flambe.experiment.progress import ProgressState
from flambe.experiment.tune_adapter import TuneAdapter
from flambe.logging import coloredlogs as cl
+from flambe.experiment.utils import get_default_devices

logger = logging.getLogger(__name__)

@@ -275,11 +276,7 @@ def run(self, force: bool = False, verbose: bool = False, debug: bool = False, *
        # By default use all CPUs if no GPU is present
        devices = self.devices if self.devices else None
        if devices is None:
-            cluster_devices = ray.cluster_resources()
-            if 'GPU' in cluster_devices or 'gpu' in cluster_devices:
-                devices = {"cpu": 4, "gpu": 1}
-            else:
-                devices = {"cpu": 1}
+            devices = get_default_devices()

        to_resume = None
        if isinstance(self.resume, str):
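
Note: the net effect of the experiment.py change is that Experiment.run no longer inspects Ray's cluster resources inline (and no longer hardcodes {"cpu": 4, "gpu": 1}); device resolution moves into the new helper. A minimal sketch of the resulting flow, abridged from the diff above (the surrounding method body is omitted):

    # Sketch of the device-resolution step inside Experiment.run, per this diff
    devices = self.devices if self.devices else None
    if devices is None:
        # Queries Ray when initialized, otherwise falls back to local CPU/GPU counts
        devices = get_default_devices()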
58 changes: 58 additions & 0 deletions flambe/experiment/utils.py
@@ -3,6 +3,7 @@
from collections import abc
from typing import Dict, List, Mapping, Any, Optional, Iterable, Set, Sequence, MutableMapping

+import ray
import torch
from ruamel.yaml.compat import StringIO
from ruamel import yaml as original_yaml
@@ -549,3 +550,60 @@ def shutdown_remote_ray_node(host: str,
"""
cmd = f"ssh -i {key} -o StrictHostKeyChecking=no {user}@{host} \"bash -lc 'ray stop'\""
return os.system(cmd)


def get_default_devices(debug: bool = False,
                        default_cpus: int = 1,
                        default_gpus: int = 1) -> Dict[str, int]:
    """Get the default devices to use if none were provided.

    Parameters
    ----------
    debug : bool, optional
        Whether we are running in debug mode (where Ray is not available).
        If debug is False, this method should be called after running
        ``ray.init()``.
    default_cpus : int, optional
        The default number of CPUs to use. Default ``1``.
    default_gpus : int, optional
        The default number of GPUs to use. Default ``1``.

    Returns
    -------
    devices : Dict[str, int]
        The default set of devices to use. Has at most two keys:
        'cpu', and 'gpu' if CUDA is available.

    Raises
    ------
    ValueError
        If the requested number of CPUs or GPUs is larger than the
        number available.

    """
    # Get available resources
    if not debug and ray.is_initialized():
        cluster_devices = ray.cluster_resources()
        num_cpus = max(cluster_devices.get('CPU', 0), cluster_devices.get('cpu', 0))
        num_gpus = max(cluster_devices.get('GPU', 0), cluster_devices.get('gpu', 0))
    elif torch.cuda.is_available():
        num_cpus = os.cpu_count()
        num_gpus = torch.cuda.device_count()
    else:
        num_cpus = os.cpu_count()
        num_gpus = 0

    # Check that the number requested is not larger than the number available
    if default_cpus > num_cpus:
        raise ValueError(f"Number of CPUs requested ({default_cpus}) is larger "
                         f"than the total number available ({num_cpus}).")
    if num_gpus > 0 and default_gpus > num_gpus:
        raise ValueError(f"Number of GPUs requested ({default_gpus}) is larger "
                         f"than the total number available ({num_gpus}).")

    # Set provided defaults
    if num_gpus > 0:
        devices = {"cpu": default_cpus, "gpu": default_gpus}
    else:
        devices = {"cpu": default_cpus}

    return devices
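
For reference, a short usage sketch of the new helper (a hypothetical session: the exact counts depend on the host machine, and outside of debug mode ray.init() must run first, as the docstring notes):

    import ray
    from flambe.experiment.utils import get_default_devices

    ray.init(num_cpus=4)                 # register local resources with Ray
    get_default_devices()                # -> {'cpu': 1} on a CPU-only machine
    get_default_devices(default_cpus=2)  # -> {'cpu': 2}
    ray.shutdown()

    # In debug mode Ray is bypassed; os.cpu_count() and torch.cuda are used instead
    get_default_devices(debug=True)      # -> {'cpu': 1} when no GPU is present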
75 changes: 74 additions & 1 deletion tests/unit/experiment/test_utils.py
@@ -1,7 +1,8 @@
import pytest
+import mock

from flambe.compile import Component, yaml
-from flambe.experiment.utils import divide_nested_grid_search_options
+from flambe.experiment.utils import divide_nested_grid_search_options, get_default_devices


@pytest.fixture
@@ -112,3 +113,75 @@ def test_divide_nested_grid_search_options_nested_options(make_classes):
    config2 = yaml.load(txt_2)
    divided_configs = list(divide_nested_grid_search_options(config))
    assert repr(divided_configs) == repr([config1, config2])


@pytest.mark.parametrize("initialized", [True, False])
@pytest.mark.parametrize("debug", [True, False])
@mock.patch("flambe.experiment.utils.torch.cuda.device_count")
@mock.patch("flambe.experiment.utils.os.cpu_count")
@mock.patch("flambe.experiment.utils.ray.cluster_resources")
@mock.patch("flambe.experiment.utils.ray.is_initialized")
@mock.patch("flambe.experiment.utils.torch.cuda.is_available")
def test_default_devices_cpu(cuda_available,
                             ray_initialized,
                             resources,
                             cpu_count,
                             gpu_count,
                             debug,
                             initialized):
    cuda_available.return_value = False
    ray_initialized.return_value = initialized
    cpu_count.return_value = 2
    gpu_count.return_value = 0
    resources.return_value = {'cpu': 2}

    devices = get_default_devices(debug=debug)
    assert devices == {'cpu': 1}

    devices = get_default_devices(debug=debug, default_cpus=2)
    assert devices == {'cpu': 2}

    devices = get_default_devices(debug=debug, default_gpus=1)
    assert devices == {'cpu': 1}

    with pytest.raises(ValueError):
        get_default_devices(debug=debug, default_cpus=3)


@pytest.mark.parametrize("initialized", [True, False])
@pytest.mark.parametrize("debug", [True, False])
@mock.patch("flambe.experiment.utils.torch.cuda.device_count")
@mock.patch("flambe.experiment.utils.os.cpu_count")
@mock.patch("flambe.experiment.utils.ray.cluster_resources")
@mock.patch("flambe.experiment.utils.ray.is_initialized")
@mock.patch("flambe.experiment.utils.torch.cuda.is_available")
def test_default_devices_gpu(cuda_available,
                             ray_initialized,
                             resources,
                             cpu_count,
                             gpu_count,
                             debug,
                             initialized):
    cuda_available.return_value = True
    ray_initialized.return_value = initialized
    cpu_count.return_value = 2
    gpu_count.return_value = 2
    resources.return_value = {'cpu': 2, 'gpu': 2}

    devices = get_default_devices(debug=debug)
    assert devices == {'cpu': 1, 'gpu': 1}

    devices = get_default_devices(debug=debug, default_cpus=2)
    assert devices == {'cpu': 2, 'gpu': 1}

    devices = get_default_devices(debug=debug, default_gpus=2)
    assert devices == {'cpu': 1, 'gpu': 2}

    devices = get_default_devices(debug=debug, default_cpus=2, default_gpus=2)
    assert devices == {'cpu': 2, 'gpu': 2}

    with pytest.raises(ValueError):
        get_default_devices(debug=debug, default_gpus=3)

    with pytest.raises(ValueError):
        get_default_devices(debug=debug, default_cpus=3)
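
One detail worth noting about the tests above: stacked mock.patch decorators apply bottom-up, so the bottom-most patch (torch.cuda.is_available) maps to the first test argument (cuda_available). A standalone illustration of the same convention (a hypothetical example, not from this repository):

    import os
    import mock

    @mock.patch("os.cpu_count")   # applied last  -> bound to the second argument
    @mock.patch("os.getcwd")      # applied first -> bound to the first argument
    def check(getcwd, cpu_count):
        getcwd.return_value = "/tmp"
        cpu_count.return_value = 8
        assert os.getcwd() == "/tmp" and os.cpu_count() == 8

    check()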
