/
gpu.py
89 lines (71 loc) · 3.02 KB
/
gpu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import subprocess
import time
from multiprocessing import TimeoutError
import logging
from vw_serving.sagemaker.exceptions import CustomerValueError
# Sentinel accepted by get_num_gpus() for its `num_gpus` argument: use the GPU count detected on the host.
AUTODETECT_GPU_COUNT = "auto"
# Module-level cache of the GPU count found by _query_num_gpus(); stays None until a query succeeds.
_num_gpus = None
def _query_num_gpus():
    """
    Returns the number of GPU devices on the host. Returns 0 if the host has no GPU devices.

    The count is obtained by running ``nvidia-smi -L`` through the shell and counting the
    ``GPU <n>`` lines. A successfully parsed count is cached in the module-level ``_num_gpus``
    so the command runs at most once per process; failures are not cached.

    :return: (int) number of GPUs reported by nvidia-smi; 0 if the command could not be launched
             or its output could not be parsed as an integer
    :raises TimeoutError: if the command does not finish within the timeout
    """
    global _num_gpus
    if _num_gpus is not None:
        return _num_gpus

    COMMAND = 'nvidia-smi -L 2>/dev/null | grep \'GPU [0-9]\' | wc -l'
    TIMEOUT_SECONDS = 75
    STATUS_POLL_INTERVAL_SECONDS = 0.025

    try:
        # No bufsize=1 here: line buffering is only supported for text-mode pipes and
        # triggers a RuntimeWarning for binary pipes on recent Python versions.
        proc = subprocess.Popen(COMMAND, shell=True, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
    except (OSError, ValueError):
        logging.exception("Error launching /usr/bin/nvidia-smi.")
        return 0

    start_time = time.time()
    try:
        # Wait for the process to finish
        exitcode = None
        while exitcode is None and time.time() - start_time < TIMEOUT_SECONDS:
            time.sleep(STATUS_POLL_INTERVAL_SECONDS)
            exitcode = proc.poll()

        # Terminate the process if not finished
        if exitcode is None:
            logging.error("nvidia-smi timed out after %s secs", time.time() - start_time)
            proc.terminate()
            # Reap the terminated shell so it does not linger as a zombie process.
            proc.wait()
            raise TimeoutError

        raw_count = proc.stdout.readline()
    finally:
        # Always release the pipe's file descriptor, on success and on timeout alike.
        proc.stdout.close()

    try:
        detected = int(raw_count)
    except ValueError:
        # stderr of the shell pipeline is merged into stdout, so the first line may be an
        # error message (or empty) rather than the count produced by `wc -l`.
        logging.exception("Unexpected output from nvidia-smi: %r", raw_count)
        return 0

    _num_gpus = detected
    logging.info("nvidia-smi took: %s secs to identify %d gpus", time.time() - start_time, _num_gpus)
    return _num_gpus
def get_num_gpus(num_gpus=AUTODETECT_GPU_COUNT, **kwargs):
    """
    Returns the number of available GPUs based on configuration parameters and available hardware GPU devices.

    :param num_gpus: (int or "auto")
            If set to "auto", the function queries and returns the number of available GPUs.
            If set to an integer value (or a string holding one), the function returns the value of
            min(num_gpus, auto_detected_gpu_count).
            Otherwise raises CustomerValueError.
    :param kwargs: additional configuration parameters, not used
    :return: (int) number of GPUs. If GPU detection times out, returns 0 for "auto" and the
             requested count otherwise.
    :raises CustomerValueError: if num_gpus is neither "auto" nor convertible to an integer
    """
    autodetect = num_gpus == AUTODETECT_GPU_COUNT
    num_requested_gpus = None

    if not autodetect:
        # Validate and convert up front so every return path below, including the timeout
        # fallback, yields an int and invalid input is always rejected.
        try:
            num_requested_gpus = int(num_gpus)
        except (TypeError, ValueError):
            raise CustomerValueError(
                "Invalid value '{}' provided for hyperparameter '_num_gpus'. '_num_gpus' must be an integer or 'auto'. "
                "Please set the value of '_num_gpus' hyperparameter to 'auto' or an integer value and try again."
                .format(num_gpus))

        # Shortcut execution if what we want is 0 gpu, i.e. only cpu
        if num_requested_gpus == 0:
            return 0

    try:
        num_available_gpus = _query_num_gpus()
    except TimeoutError:
        # Detection did not finish: fall back to no GPUs for "auto", otherwise trust the request.
        fallback = 0 if autodetect else num_requested_gpus
        logging.warning("Timed out detecting GPUs; falling back to %d gpus", fallback)
        return fallback

    if autodetect:
        return num_available_gpus

    if num_requested_gpus > num_available_gpus:
        logging.warning("Request number of gpus: %d, Number of GPUs found: %d",
                        num_requested_gpus, num_available_gpus)
        return num_available_gpus
    return num_requested_gpus