From 2de2c40b10c6c954c3d6ad7cbb52e02ca3314e09 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Wed, 28 May 2025 14:59:35 -0400 Subject: [PATCH] =?UTF-8?q?Revert=20"Fix=20a=20bug=20that=20was=20causing?= =?UTF-8?q?=20compute=20node=20self=E2=80=91termination=20to=20hang=20on?= =?UTF-8?q?=20Ubuntu=2024.04=20nodes"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 392739113d44ab8b10dda596890275fe2273b3f4. --- CHANGELOG.md | 4 ++-- src/slurm_plugin/computemgtd.py | 22 +----------------- tests/slurm_plugin/test_computemgtd.py | 32 +++----------------------- 3 files changed, 6 insertions(+), 52 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fe3e8b7ea..b581e203e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,8 +6,8 @@ This file is used to list changes made in each version of the aws-parallelcluste 3.13.1 ------ -**BUG FIXES** -- Fix a bug that was causing compute node self‑termination to hang on Ubuntu 24.04 nodes. +**CHANGES** +- There were no changes for this version. 3.13.0 ------ diff --git a/src/slurm_plugin/computemgtd.py b/src/slurm_plugin/computemgtd.py index 5c88a8916..3ae4f3261 100644 --- a/src/slurm_plugin/computemgtd.py +++ b/src/slurm_plugin/computemgtd.py @@ -125,34 +125,14 @@ def _read_nodename_from_file(nodename_file_path): raise -def _is_ubuntu2404(): - """Return True if the OS is Ubuntu 24.04.""" - try: - with open("/etc/os-release", "r") as f: - info = dict(line.strip().split("=", 1) for line in f if "=" in line) - os_id = info.get("ID", "").strip('"').lower() - version = info.get("VERSION_ID", "").strip('"') - return os_id == "ubuntu" and version.startswith("24.04") - except Exception as e: - log.warning("Unable to detect OS version from /etc/os-release: %s", e) - return False - - @log_exception(log, "self terminating compute instance", catch_exception=CalledProcessError, raise_on_error=False) def _self_terminate(): """Self terminate the instance.""" # Sleep for 10 seconds so termination log entries are uploaded to CW logs log.info("Preparing to self terminate the instance in 10 seconds!") time.sleep(10) - if _is_ubuntu2404(): - shutdown_cmd = "sudo systemctl poweroff --force" - log.info("Detected Ubuntu 24.04 – using `%s`", shutdown_cmd) - else: - shutdown_cmd = "sudo shutdown -h now" - log.info("Using default shutdown command `%s`", shutdown_cmd) - log.info("Self terminating instance now!") - run_command(shutdown_cmd) + run_command("sudo shutdown -h now") @retry(stop_max_attempt_number=3, wait_fixed=1500) diff --git a/tests/slurm_plugin/test_computemgtd.py b/tests/slurm_plugin/test_computemgtd.py index d28354211..ba9745609 100644 --- a/tests/slurm_plugin/test_computemgtd.py +++ b/tests/slurm_plugin/test_computemgtd.py @@ -12,12 +12,11 @@ import logging import os -from unittest.mock import mock_open import pytest import slurm_plugin from assertpy import assert_that -from slurm_plugin.computemgtd import ComputemgtdConfig, _is_self_node_down, _is_ubuntu2404, _self_terminate +from slurm_plugin.computemgtd import ComputemgtdConfig, _is_self_node_down, _self_terminate from slurm_plugin.slurm_resources import DynamicNode @@ -104,38 +103,13 @@ def test_is_self_node_down(mock_node_info, expected_result, mocker): assert_that(_is_self_node_down("queue1-st-c5xlarge-1")).is_equal_to(expected_result) -@pytest.mark.parametrize( - ("is_ubuntu2404", "expected_cmd"), - [ - (True, "sudo systemctl poweroff --force"), - (False, "sudo shutdown -h now"), - ], -) -def test_self_terminate(mocker, caplog, is_ubuntu2404, expected_cmd): +def test_self_terminate(mocker, caplog): """Verify self-termination is implemented via a shutdown command rather than calling TerminateInstances.""" - mocker.patch("slurm_plugin.computemgtd._is_ubuntu2404", return_value=is_ubuntu2404) run_command_patch = mocker.patch("slurm_plugin.computemgtd.run_command") sleep_patch = mocker.patch("slurm_plugin.computemgtd.time.sleep") with caplog.at_level(logging.INFO): _self_terminate() assert_that(caplog.text).contains("Preparing to self terminate the instance in 10 seconds!") assert_that(caplog.text).contains("Self terminating instance now!") - run_command_patch.assert_called_with(expected_cmd) + run_command_patch.assert_called_with("sudo shutdown -h now") sleep_patch.assert_called_with(10) - - -@pytest.mark.parametrize( - ("file_content", "expected"), - [ - ('ID=ubuntu\nVERSION_ID="24.04"\n', True), - ('ID=ubuntu\nVERSION_ID="24.04.2"\n', True), - ('ID=ubuntu\nVERSION_ID="22.04"\n', False), - ('ID=rocky\nVERSION_ID="9.3"\n', False), - ("ID=ubuntu\n", False), - ], -) -def test_is_ubuntu2404(file_content, expected, mocker): - m = mock_open(read_data=file_content) - mocker.patch("builtins.open", m) - - assert _is_ubuntu2404() is expected