
Commit 13eb8e4

Rexrexcsn authored and committed

Handling EC2 Health Scheduled Events

* Modified sqswatcher to retrieve and process messages from the health queue. Lock the instance if there is an EC2 health scheduled event, so that the instance can be replaced by nodewatcher as soon as possible.
* Modified nodewatcher to consider nodes that are locked and have no jobs as down nodes.
* Modified the unit tests to adapt to the above modifications.
* This feature can be turned off by specifying `disable_health_check = True` in `/etc/sqswatcher.cfg` and restarting sqswatcher.

Signed-off-by: Rex <shuningc@amazon.com>
1 parent 0df444f commit 13eb8e4
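
For reference, turning the health-check handling off would look roughly like the snippet below. This is a minimal sketch, assuming the option sits alongside the existing keys in the `[sqswatcher]` section of `/etc/sqswatcher.cfg`; only the `disable_health_check` key itself comes from this commit.

```
# /etc/sqswatcher.cfg (sketch; existing keys omitted)
[sqswatcher]
disable_health_check = True
```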

File tree

17 files changed: +725, -106 lines


src/common/schedulers/slurm_commands.py

Lines changed: 44 additions & 1 deletion

```diff
@@ -12,11 +12,14 @@

 import logging
 import math
+import subprocess
 from textwrap import wrap

 from common.schedulers.converters import ComparableObject, from_table_to_obj_list
-from common.utils import check_command_output
+from common.utils import check_command_output, run_command

+SLURM_NODE_ERROR_STATES = ["down", "drained", "fail"]
+SLURM_NODE_DISABLED_STATES = ["draining", "drained"]
 PENDING_RESOURCES_REASONS = [
     "Resources",
     "Nodes required for job are DOWN, DRAINED or reserved for jobs in higher priority partitions",
@@ -59,6 +62,19 @@ def get_jobs_info(job_state_filter=None):
     return SlurmJob.from_table(output)


+def get_node_state(hostname):
+    # retrieves the state of a specific node
+    # https://slurm.schedmd.com/sinfo.html#lbAG
+    # Output format:
+    # down*
+    try:
+        command = "/bin/bash -c \"/opt/slurm/bin/sinfo --noheader -o '%T' -n {}\"".format(hostname)
+        output = check_command_output(command).strip()
+        return output
+    except Exception as e:
+        logging.error("Failed when checking node {} state with exception {}.".format(hostname, e))
+
+
 def get_pending_jobs_info(
     instance_properties=None, max_nodes_filter=None, filter_by_pending_reasons=None, log_pending_jobs=True
 ):
@@ -269,6 +285,33 @@ def job_runnable_on_given_node(job_resources_per_node, resources_available, exis
     return True


+def lock_node(hostname, unlock=False, note=None):
+    # hostname format: ip-10-0-0-114.eu-west-1.compute.internal
+    hostname = hostname.split(".")[0]
+    if unlock:
+        logging.info("Unlocking host %s", hostname)
+        command = [
+            "/opt/slurm/bin/scontrol",
+            "update",
+            "NodeName={0}".format(hostname),
+            "State=RESUME",
+            "Reason={}".format(note if note else '"Unlocking"'),
+        ]
+    else:
+        logging.info("Locking host %s", hostname)
+        command = [
+            "/opt/slurm/bin/scontrol",
+            "update",
+            "NodeName={0}".format(hostname),
+            "State=DRAIN",
+            "Reason={}".format(note if note else '"Shutting down"'),
+        ]
+    try:
+        run_command(command)
+    except subprocess.CalledProcessError:
+        logging.error("Error %s host %s", "unlocking" if unlock else "locking", hostname)
+
+
 class SlurmJob(ComparableObject):
     # This is the format after being processed by reformat_table function
     # JOBID|ST|NODES|CPUS|TASKS|CPUS_PER_TASK|MIN_CPUS|REASON|TRES_PER_JOB|TRES_PER_TASK
```
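
To illustrate how the two new slurm_commands helpers fit together, here is a short usage sketch. It is not part of the commit; the hostname and the lock reason are made up, and it assumes it runs on a host where `/opt/slurm/bin` and the `common` package are available.

```python
from common.schedulers.slurm_commands import SLURM_NODE_ERROR_STATES, get_node_state, lock_node

fqdn = "ip-10-0-0-114.eu-west-1.compute.internal"

# Drain the node so Slurm stops placing new jobs on it. The quoted reason mirrors
# the quoting used by the function's own defaults.
lock_node(fqdn, note='"EC2 scheduled maintenance"')

# get_node_state() expects the short node name and returns sinfo's %T output, e.g. "drained*".
state = get_node_state(fqdn.split(".")[0])
if state and any(error_state in state for error_state in SLURM_NODE_ERROR_STATES):
    print("Node {} is in an error state: {}".format(fqdn, state))

# Once the event has passed, return the node to service.
lock_node(fqdn, unlock=True)
```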

src/common/schedulers/torque_commands.py

Lines changed: 16 additions & 1 deletion

```diff
@@ -17,7 +17,8 @@
 from common.schedulers.converters import ComparableObject, from_xml_to_obj
 from common.utils import check_command_output, run_command

-TORQUE_NODE_ERROR_STATES = ("down", "offline", "unknown")
+TORQUE_NODE_ERROR_STATES = ("down", "unknown")
+TORQUE_NODE_DISABLED_STATE = "offline"
 TORQUE_NODE_STATES = (
     "free",
     "offline",
@@ -129,6 +130,20 @@ def delete_nodes(hosts):
     return succeeded_hosts


+def lock_node(hostname, unlock=False, note=None):
+    # hostname format: ip-10-0-0-114.eu-west-1.compute.internal
+    hostname = hostname.split(".")[0]
+    mod = unlock and "-c" or "-o"
+    command = [TORQUE_BIN_DIR + "pbsnodes", mod, hostname]
+    if note:
+        command.append("-N '{}'".format(note))
+    try:
+        run_command(command)
+    except subprocess.CalledProcessError:
+        logging.error("Error %s host %s", "unlocking" if unlock else "locking", hostname)
+        raise
+
+
 def update_cluster_limits(max_nodes, node_slots):
     try:
         logging.info("Updating cluster limits: max_nodes=%d, node_slots=%d", max_nodes, node_slots)
```

src/common/utils.py

Lines changed: 36 additions & 0 deletions

```diff
@@ -43,6 +43,7 @@ class EventType(Enum):

 Host = collections.namedtuple("Host", ["instance_id", "hostname", "slots", "gpus"])
 UpdateEvent = collections.namedtuple("UpdateEvent", ["action", "message", "host"])
+INSTANCE_ALIVE_STATE = ["pending", "running"]


 def load_module(module):
@@ -389,3 +390,38 @@ def retrieve_max_cluster_size(region, proxy_config, asg_name, fallback):
     )
     log.critical(error_msg)
     raise CriticalError(error_msg)
+
+
+def get_cluster_instance_info(cluster_name, region, include_master=False):
+    """Return a dict of instance_id to nodename."""
+    try:
+        instances_in_cluster = {}
+        ec2_client = boto3.client("ec2", region_name=region)
+        nodes_to_include = ["Compute", "Master"] if include_master else ["Compute"]
+        next_token = None
+        while True:
+            function_args = {
+                "Filters": [
+                    {"Name": "tag:Application", "Values": [cluster_name]},
+                    {"Name": "tag:Name", "Values": nodes_to_include},
+                ],
+                "MaxResults": 1000,
+            }
+            if next_token:
+                function_args["NextToken"] = next_token
+            response = ec2_client.describe_instances(**function_args)
+            for reservation in response.get("Reservations"):
+                for instance in reservation.get("Instances"):
+                    is_alive = instance.get("State").get("Name") in INSTANCE_ALIVE_STATE
+                    instance_id = instance.get("InstanceId")
+                    hostname = instance.get("PrivateDnsName").split(".")[0]
+                    if is_alive:
+                        instances_in_cluster[instance_id] = hostname
+            next_token = response.get("NextToken")
+            if not next_token or next_token == "null":
+                break
+
+        return instances_in_cluster
+
+    except Exception as e:
+        logging.error("Failed retrieving instance_ids for cluster {} with exception: {}".format(cluster_name, e))
```

src/nodewatcher/nodewatcher.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -277,6 +277,11 @@ def _init_idletime():


 def _lock_and_terminate(region, proxy_config, scheduler_module, hostname, instance_id):
+    # handle the case where the instance was locked by a scheduled event and still has jobs running
+    if _has_jobs(scheduler_module, hostname):
+        log.info("Instance has active jobs.")
+        return
+    # handle the case where the instance has no jobs running to begin with
     _lock_host(scheduler_module, hostname)
     if _has_jobs(scheduler_module, hostname):
         log.info("Instance has active jobs.")
```

src/nodewatcher/plugins/sge.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -14,6 +14,7 @@
 import subprocess

 from common.schedulers.sge_commands import (
+    SGE_DISABLED_STATE,
     SGE_ERROR_STATES,
     SGE_HOLD_STATE,
     get_compute_nodes_info,
@@ -86,7 +87,11 @@ def is_node_down():

         node = nodes.get(host_fqdn, nodes.get(hostname))
         log.info("Node is in state: '{0}'".format(node.state))
+        # check if any error state is present
         if all(error_state not in node.state for error_state in SGE_ERROR_STATES):
+            # Consider the node down if it's in disabled state and there is no job running
+            if SGE_DISABLED_STATE in node.state and not has_jobs(hostname):
+                return True
             return False
     except Exception as e:
         log.error("Failed when checking if node is down with exception %s. Reporting node as down.", e)
```

src/nodewatcher/plugins/slurm.py

Lines changed: 12 additions & 33 deletions

```diff
@@ -12,8 +12,14 @@
 import logging
 import subprocess

-from common.schedulers.slurm_commands import PENDING_RESOURCES_REASONS, get_pending_jobs_info
-from common.utils import check_command_output, run_command
+from common.schedulers.slurm_commands import (
+    PENDING_RESOURCES_REASONS,
+    SLURM_NODE_ERROR_STATES,
+    get_node_state,
+    get_pending_jobs_info,
+    lock_node,
+)
+from common.utils import check_command_output

 log = logging.getLogger(__name__)

@@ -54,43 +60,16 @@ def has_pending_jobs(instance_properties, max_size):


 def lock_host(hostname, unlock=False):
-    # hostname format: ip-10-0-0-114.eu-west-1.compute.internal
-    hostname = hostname.split(".")[0]
-    if unlock:
-        log.info("Unlocking host %s", hostname)
-        command = [
-            "/opt/slurm/bin/scontrol",
-            "update",
-            "NodeName={0}".format(hostname),
-            "State=RESUME",
-            'Reason="Unlocking"',
-        ]
-    else:
-        log.info("Locking host %s", hostname)
-        command = [
-            "/opt/slurm/bin/scontrol",
-            "update",
-            "NodeName={0}".format(hostname),
-            "State=DRAIN",
-            'Reason="Shutting down"',
-        ]
-    try:
-        run_command(command)
-    except subprocess.CalledProcessError:
-        log.error("Error %s host %s", "unlocking" if unlock else "locking", hostname)
+    lock_node(hostname, unlock=unlock)


 def is_node_down():
     """Check if node is down according to scheduler."""
     try:
-        # retrieves the state of a specific node
-        # https://slurm.schedmd.com/sinfo.html#lbAG
-        # Output format:
-        # down*
-        command = "/bin/bash -c \"/opt/slurm/bin/sinfo --noheader -o '%T' -n $(hostname)\""
-        output = check_command_output(command).strip()
+        hostname = check_command_output("hostname").strip()
+        output = get_node_state(hostname)
         log.info("Node is in state: '{0}'".format(output))
-        if output and all(state not in output for state in ["down", "drained", "fail"]):
+        if output and all(state not in output for state in SLURM_NODE_ERROR_STATES):
             return False
     except Exception as e:
         log.error("Failed when checking if node is down with exception %s. Reporting node as down.", e)
```

src/nodewatcher/plugins/torque.py

Lines changed: 8 additions & 11 deletions

```diff
@@ -10,18 +10,18 @@
 # limitations under the License.

 import logging
-import subprocess

 from common.schedulers.torque_commands import (
-    TORQUE_BIN_DIR,
+    TORQUE_NODE_DISABLED_STATE,
     TORQUE_NODE_ERROR_STATES,
     TORQUE_RUNNING_JOB_STATE,
     TORQUE_SUSPENDED_JOB_STATE,
     get_compute_nodes_info,
     get_jobs_info,
     get_pending_jobs_info,
+    lock_node,
 )
-from common.utils import check_command_output, run_command
+from common.utils import check_command_output

 log = logging.getLogger(__name__)

@@ -56,14 +56,7 @@ def has_pending_jobs(instance_properties, max_size):


 def lock_host(hostname, unlock=False):
-    # hostname format: ip-10-0-0-114.eu-west-1.compute.internal
-    hostname = hostname.split(".")[0]
-    mod = unlock and "-c" or "-o"
-    command = [TORQUE_BIN_DIR + "pbsnodes", mod, hostname]
-    try:
-        run_command(command)
-    except subprocess.CalledProcessError:
-        log.error("Error %s host %s", "unlocking" if unlock else "locking", hostname)
+    lock_node(hostname, unlock=unlock)


 def is_node_down():
@@ -74,6 +67,10 @@ def is_node_down():
         if node:
             log.info("Node is in state: '{0}'".format(node.state))
             if all(error_state not in node.state for error_state in TORQUE_NODE_ERROR_STATES):
+                # Consider the node down if it is in the disabled state placed by a scheduled event
+                # and does not have jobs
+                if TORQUE_NODE_DISABLED_STATE in node.state and not has_jobs(hostname):
+                    return True
                 return False
         else:
             log.warning("Node is not attached to scheduler. Reporting as down")
```

src/sqswatcher/plugins/sge.py

Lines changed: 47 additions & 0 deletions

```diff
@@ -9,13 +9,17 @@
 # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
+import socket

 from common.schedulers.sge_commands import (
     QCONF_COMMANDS,
+    SGE_DISABLED_STATE,
     add_host_slots,
     add_hosts_to_group,
     exec_qconf_command,
+    get_compute_nodes_info,
     install_sge_on_compute_nodes,
+    lock_host,
     remove_hosts_from_group,
     remove_hosts_from_queue,
 )
@@ -99,5 +103,48 @@ def update_cluster(max_cluster_size, cluster_user, update_events, instance_prope
     return failed, succeeded


+def _is_node_locked(hostname):
+    node_info = get_compute_nodes_info(hostname_filter=hostname)
+    node = node_info.get(socket.getfqdn(hostname), node_info.get(hostname))
+    if SGE_DISABLED_STATE in node.state:
+        return True
+    return False
+
+
+def perform_health_actions(health_events):
+    """Lock the hosts referenced by EC2 scheduled maintenance events."""
+    failed = []
+    succeeded = []
+    for event in health_events:
+        try:
+            # TODO: ignore a failure to lock if the node is not known to the scheduler
+            if _is_node_locked(event.host.hostname):
+                log.error(
+                    "Instance {}/{} is currently in disabled state 'd'. "
+                    "Locking it now for the scheduled event risks having the lock released by nodewatcher. "
+                    "Marking the event as failed to retry later.".format(event.host.instance_id, event.host.hostname)
+                )
+                failed.append(event)
+                continue
+            lock_host(event.host.hostname)
+            if _is_node_locked(event.host.hostname):
+                succeeded.append(event)
+                log.info(
+                    "Successfully locked {} in response to scheduled maintenance event".format(event.host.hostname)
+                )
+            else:
+                failed.append(event)
+                log.info("Failed to lock {} in response to scheduled maintenance event".format(event.host.hostname))
+        except Exception as e:
+            log.error(
+                "Encountered exception when locking {} because of a scheduled maintenance event: {}".format(
+                    event.host.hostname, e
+                )
+            )
+            failed.append(event)
+
+    return failed, succeeded
+
+
 def init():
     pass
```
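
A test-style sketch of the new health path (not one of the commit's unit tests). The UpdateEvent/Host shapes follow the common.utils diff above; the import paths and the action label are assumptions.

```python
from unittest.mock import patch

from common.utils import Host, UpdateEvent
from sqswatcher.plugins import sge

event = UpdateEvent(
    action="EC2_SCHEDULED_EVENT",  # assumed label; sqswatcher sets the real one
    message={},                    # raw SQS message body, unused by this code path
    host=Host("i-0123456789abcdef0", "ip-10-0-0-114.eu-west-1.compute.internal", 4, 0),
)

# First _is_node_locked() call says the node is not locked yet; the second, after
# lock_host(), confirms the lock took effect.
with patch.object(sge, "_is_node_locked", side_effect=[False, True]), patch.object(sge, "lock_host"):
    failed, succeeded = sge.perform_health_actions([event])

assert succeeded == [event] and failed == []
```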
