12 | 12 |
13 | 13 | import logging
14 | 14 | import math
   | 15 | +import subprocess
15 | 16 | from textwrap import wrap
16 | 17 |
17 | 18 | from common.schedulers.converters import ComparableObject, from_table_to_obj_list
18 |    | -from common.utils import check_command_output
   | 19 | +from common.utils import check_command_output, run_command
19 | 20 |
   | 21 | +SLURM_NODE_ERROR_STATES = ["down", "drained", "fail"]
   | 22 | +SLURM_NODE_DISABLED_STATES = ["draining", "drained"]
20 | 23 | PENDING_RESOURCES_REASONS = [
21 | 24 |     "Resources",
22 | 25 |     "Nodes required for job are DOWN, DRAINED or reserved for jobs in higher priority partitions",
@@ -59,6 +62,19 @@ def get_jobs_info(job_state_filter=None):
59 | 62 |     return SlurmJob.from_table(output)
60 | 63 |
61 | 64 |
   | 65 | +def get_node_state(hostname):
   | 66 | +    # retrieves the state of a specific node
   | 67 | +    # https://slurm.schedmd.com/sinfo.html#lbAG
   | 68 | +    # Output format:
   | 69 | +    # down*
   | 70 | +    try:
   | 71 | +        command = "/bin/bash -c \"/opt/slurm/bin/sinfo --noheader -o '%T' -n {}\"".format(hostname)
   | 72 | +        output = check_command_output(command).strip()
   | 73 | +        return output
   | 74 | +    except Exception as e:
   | 75 | +        logging.error("Failed when checking node {} state with exception {}.".format(hostname, e))
   | 76 | +
   | 77 | +
62 | 78 | def get_pending_jobs_info(
63 | 79 |     instance_properties=None, max_nodes_filter=None, filter_by_pending_reasons=None, log_pending_jobs=True
64 | 80 | ):
@@ -269,6 +285,33 @@ def job_runnable_on_given_node(job_resources_per_node, resources_available, exis
269 | 285 |     return True
270 | 286 |
271 | 287 |
    | 288 | +def lock_node(hostname, unlock=False, note=None):
    | 289 | +    # hostname format: ip-10-0-0-114.eu-west-1.compute.internal
    | 290 | +    hostname = hostname.split(".")[0]
    | 291 | +    if unlock:
    | 292 | +        logging.info("Unlocking host %s", hostname)
    | 293 | +        command = [
    | 294 | +            "/opt/slurm/bin/scontrol",
    | 295 | +            "update",
    | 296 | +            "NodeName={0}".format(hostname),
    | 297 | +            "State=RESUME",
    | 298 | +            "Reason={}".format(note if note else '"Unlocking"'),
    | 299 | +        ]
    | 300 | +    else:
    | 301 | +        logging.info("Locking host %s", hostname)
    | 302 | +        command = [
    | 303 | +            "/opt/slurm/bin/scontrol",
    | 304 | +            "update",
    | 305 | +            "NodeName={0}".format(hostname),
    | 306 | +            "State=DRAIN",
    | 307 | +            "Reason={}".format(note if note else '"Shutting down"'),
    | 308 | +        ]
    | 309 | +    try:
    | 310 | +        run_command(command)
    | 311 | +    except subprocess.CalledProcessError:
    | 312 | +        logging.error("Error %s host %s", "unlocking" if unlock else "locking", hostname)
    | 313 | +
    | 314 | +
272 | 315 | class SlurmJob(ComparableObject):
273 | 316 |     # This is the format after being processed by reformat_table function
274 | 317 |     # JOBID|ST|NODES|CPUS|TASKS|CPUS_PER_TASK|MIN_CPUS|REASON|TRES_PER_JOB|TRES_PER_TASK
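And a usage sketch for lock_node (again not part of the commit; the hostname and reason strings are illustrative). Locking drains the node so Slurm schedules no new jobs on it, unlocking resumes it:

# Drain the node before terminating the instance (State=DRAIN).
lock_node("ip-10-0-0-114.eu-west-1.compute.internal", note='"Scaling down"')

# Bring the node back into service later (State=RESUME).
lock_node("ip-10-0-0-114.eu-west-1.compute.internal", unlock=True)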