Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Handle systemctl when dbus not ready #4842

Merged
merged 2 commits into from
Feb 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
89 changes: 46 additions & 43 deletions cloudinit/cmd/status.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,36 @@ class StatusDetails(NamedTuple):
{description}"""


def query_systemctl(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a good utility function to generalize interaction with systemctl. While we see other call-sites throughout cloud-init to systemctl in cloudini/net/network_manager, cloudinit/net/activators.py, cloudinit/sources/helpers/azure.py and cloudinit/config/cc_mounts.py, these entry points are much later in boot than a potential early-boot invocation of cloud-init status from external consumers. So I don't necessarily think we "need" this logic elsewhere for this PR. It may be worth generalizing it in util or somewhere if we find another call-site that warrants early boot interaction and exposure to the socket not yet being responsive.

Copy link
Collaborator

@blackboxsw blackboxsw Feb 2, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The current implementation is very status specific as well. So I don't think it's worth generalizing further at this time.

systemctl_args: List[str],
*,
wait: bool,
existing_status: Optional[UXAppStatus] = None,
) -> str:
"""Query systemd with retries and return output."""
while True:
try:
return subp.subp(["systemctl", *systemctl_args]).stdout.strip()
except subp.ProcessExecutionError as e:
if existing_status and existing_status in (
UXAppStatus.DEGRADED_RUNNING,
UXAppStatus.RUNNING,
):
return ""
last_exception = e
if wait:
sleep(0.25)
else:
break
print(
"Failed to get status from systemd. "
"Cloud-init status may be inaccurate. ",
f"Error from systemctl: {last_exception.stderr}",
file=sys.stderr,
)
return ""


def get_parser(parser=None):
"""Build or extend an arg parser for status utility.

Expand Down Expand Up @@ -229,7 +259,9 @@ def handle_status_args(name, args) -> int:
return 0


def get_bootstatus(disable_file, paths) -> Tuple[UXAppBootStatusCode, str]:
def get_bootstatus(
disable_file, paths, wait
) -> Tuple[UXAppBootStatusCode, str]:
"""Report whether cloud-init current boot status

@param disable_file: The path to the cloud-init disable file.
Expand All @@ -253,7 +285,7 @@ def get_bootstatus(disable_file, paths) -> Tuple[UXAppBootStatusCode, str]:
elif "cloud-init=disabled" in os.environ.get("KERNEL_CMDLINE", "") or (
uses_systemd()
and "cloud-init=disabled"
in subp.subp(["systemctl", "show-environment"]).stdout
in query_systemctl(["show-environment"], wait=wait)
):
bootstatus_code = UXAppBootStatusCode.DISABLED_BY_ENV_VARIABLE
reason = (
Expand All @@ -272,7 +304,9 @@ def get_bootstatus(disable_file, paths) -> Tuple[UXAppBootStatusCode, str]:
return (bootstatus_code, reason)


def _get_error_or_running_from_systemd() -> Optional[UXAppStatus]:
def _get_error_or_running_from_systemd(
existing_status: UXAppStatus, wait: bool
) -> Optional[UXAppStatus]:
"""Get if systemd is in error or running state.

Using systemd, we can get more fine-grained status of the
Expand All @@ -288,14 +322,18 @@ def _get_error_or_running_from_systemd() -> Optional[UXAppStatus]:
"cloud-init.service",
"cloud-init-local.service",
]:
stdout = subp.subp(
stdout = query_systemctl(
[
"systemctl",
"show",
"--property=ActiveState,UnitFileState,SubState,MainPID",
service,
],
).stdout
wait=wait,
existing_status=existing_status,
)
if not stdout:
# Systemd isn't ready
return None
states = dict(
[[x.strip() for x in r.split("=")] for r in stdout.splitlines()]
)
Expand Down Expand Up @@ -327,39 +365,6 @@ def _get_error_or_running_from_systemd() -> Optional[UXAppStatus]:
return None


def _get_error_or_running_from_systemd_with_retry(
existing_status: UXAppStatus, *, wait: bool
) -> Optional[UXAppStatus]:
"""Get systemd status and retry if dbus isn't ready.

If cloud-init has determined that we're still running, then we can
ignore the status from systemd. However, if cloud-init has detected error,
then we should retry on systemd status so we don't incorrectly report
error state while cloud-init is still running.
"""
while True:
try:
return _get_error_or_running_from_systemd()
except subp.ProcessExecutionError as e:
last_exception = e
if existing_status in (
UXAppStatus.DEGRADED_RUNNING,
UXAppStatus.RUNNING,
):
return None
if wait:
sleep(0.25)
else:
break
print(
"Failed to get status from systemd. "
"Cloud-init status may be inaccurate. ",
f"Error from systemctl: {last_exception.stderr}",
file=sys.stderr,
)
return None


def get_status_details(
paths: Optional[Paths] = None, wait: bool = False
) -> StatusDetails:
Expand All @@ -380,7 +385,7 @@ def get_status_details(
result_file = os.path.join(paths.run_dir, "result.json")

boot_status_code, description = get_bootstatus(
CLOUDINIT_DISABLED_FILE, paths
CLOUDINIT_DISABLED_FILE, paths, wait
)
if boot_status_code in DISABLED_BOOT_CODES:
status = UXAppStatus.DISABLED
Expand Down Expand Up @@ -433,9 +438,7 @@ def get_status_details(
UXAppStatus.NOT_RUN,
UXAppStatus.DISABLED,
):
systemd_status = _get_error_or_running_from_systemd_with_retry(
status, wait=wait
)
systemd_status = _get_error_or_running_from_systemd(status, wait=wait)
if systemd_status:
status = systemd_status

Expand Down
99 changes: 74 additions & 25 deletions tests/unittests/cmd/test_status.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from cloudinit.cmd.status import (
UXAppStatus,
_get_error_or_running_from_systemd,
_get_error_or_running_from_systemd_with_retry,
)
from cloudinit.subp import SubpResult
from cloudinit.util import ensure_file
Expand Down Expand Up @@ -65,7 +64,7 @@ class TestStatus:
),
)
@mock.patch(
f"{M_PATH}_get_error_or_running_from_systemd_with_retry",
f"{M_PATH}_get_error_or_running_from_systemd",
return_value=None,
)
def test_get_status_details_ds_none(
Expand Down Expand Up @@ -188,6 +187,7 @@ def test_get_bootstatus(
failure_msg: str,
expected_reason: Union[str, Callable],
config: Config,
mocker,
):
if ensured_file is not None:
ensure_file(ensured_file(config))
Expand All @@ -201,15 +201,10 @@ def test_get_bootstatus(
stderr=None,
),
):
code, reason = wrap_and_call(
M_NAME,
{
"uses_systemd": uses_systemd,
"get_cmdline": get_cmdline,
},
status.get_bootstatus,
config.disable_file,
config.paths,
mocker.patch(f"{M_PATH}uses_systemd", return_value=uses_systemd)
mocker.patch(f"{M_PATH}get_cmdline", return_value=get_cmdline)
code, reason = status.get_bootstatus(
config.disable_file, config.paths, False
)
assert code == expected_bootstatus, failure_msg
if isinstance(expected_reason, str):
Expand Down Expand Up @@ -713,7 +708,7 @@ def fakeexists(filepath):
)
@mock.patch(M_PATH + "read_cfg_paths")
@mock.patch(
f"{M_PATH}_get_error_or_running_from_systemd_with_retry",
f"{M_PATH}_get_error_or_running_from_systemd",
return_value=None,
)
def test_status_output(
Expand Down Expand Up @@ -757,7 +752,7 @@ def test_status_output(

@mock.patch(M_PATH + "read_cfg_paths")
@mock.patch(
f"{M_PATH}_get_error_or_running_from_systemd_with_retry",
f"{M_PATH}_get_error_or_running_from_systemd",
return_value=None,
)
def test_status_wait_blocks_until_done(
Expand Down Expand Up @@ -811,7 +806,7 @@ def fake_sleep(interval):

@mock.patch(M_PATH + "read_cfg_paths")
@mock.patch(
f"{M_PATH}_get_error_or_running_from_systemd_with_retry",
f"{M_PATH}_get_error_or_running_from_systemd",
return_value=None,
)
def test_status_wait_blocks_until_error(
Expand Down Expand Up @@ -867,7 +862,7 @@ def fake_sleep(interval):

@mock.patch(M_PATH + "read_cfg_paths")
@mock.patch(
f"{M_PATH}_get_error_or_running_from_systemd_with_retry",
f"{M_PATH}_get_error_or_running_from_systemd",
return_value=None,
)
def test_status_main(
Expand Down Expand Up @@ -948,7 +943,10 @@ def test_get_error_or_running_from_systemd(
stderr=None,
),
):
assert _get_error_or_running_from_systemd() == status
assert (
_get_error_or_running_from_systemd(UXAppStatus.RUNNING, False)
== status
)

def test_exception_while_running(self, mocker, capsys):
m_subp = mocker.patch(
Expand All @@ -959,9 +957,7 @@ def test_exception_while_running(self, mocker, capsys):
),
)
assert (
_get_error_or_running_from_systemd_with_retry(
UXAppStatus.RUNNING, wait=True
)
_get_error_or_running_from_systemd(UXAppStatus.RUNNING, wait=True)
is None
)
assert 1 == m_subp.call_count
Expand All @@ -987,9 +983,7 @@ def test_retry(self, mocker, capsys):
],
)
assert (
_get_error_or_running_from_systemd_with_retry(
UXAppStatus.ERROR, wait=True
)
_get_error_or_running_from_systemd(UXAppStatus.ERROR, wait=True)
is UXAppStatus.RUNNING
)
assert 3 == m_subp.call_count
Expand All @@ -1005,13 +999,68 @@ def test_retry_no_wait(self, mocker, capsys):
)
mocker.patch("time.time", side_effect=[1, 2, 50])
assert (
_get_error_or_running_from_systemd_with_retry(
UXAppStatus.ERROR, wait=False
)
_get_error_or_running_from_systemd(UXAppStatus.ERROR, wait=False)
is None
)
assert 1 == m_subp.call_count
assert (
"Failed to get status from systemd. "
"Cloud-init status may be inaccurate."
) in capsys.readouterr().err


class TestQuerySystemctl:
def test_query_systemctl(self, mocker):
m_subp = mocker.patch(
f"{M_PATH}subp.subp",
return_value=SubpResult(stdout="hello", stderr=None),
)
assert status.query_systemctl(["some", "args"], wait=False) == "hello"
m_subp.assert_called_once_with(["systemctl", "some", "args"])

def test_query_systemctl_with_exception(self, mocker, capsys):
m_subp = mocker.patch(
f"{M_PATH}subp.subp",
side_effect=subp.ProcessExecutionError(
"Message recipient disconnected", stderr="oh noes!"
),
)
assert status.query_systemctl(["some", "args"], wait=False) == ""
m_subp.assert_called_once_with(["systemctl", "some", "args"])
assert "Error from systemctl: oh noes!" in capsys.readouterr().err

def test_query_systemctl_wait_with_exception(self, mocker):
m_sleep = mocker.patch(f"{M_PATH}sleep")
m_subp = mocker.patch(
f"{M_PATH}subp.subp",
side_effect=[
subp.ProcessExecutionError("Message recipient disconnected"),
subp.ProcessExecutionError("Message recipient disconnected"),
subp.ProcessExecutionError("Message recipient disconnected"),
SubpResult(stdout="hello", stderr=None),
],
)

assert status.query_systemctl(["some", "args"], wait=True) == "hello"
assert m_subp.call_count == 4
assert m_sleep.call_count == 3

def test_query_systemctl_wait_with_exception_status(self, mocker):
m_sleep = mocker.patch(f"{M_PATH}sleep")
m_subp = mocker.patch(
f"{M_PATH}subp.subp",
side_effect=subp.ProcessExecutionError(
"Message recipient disconnected"
),
)

assert (
status.query_systemctl(
["some", "args"],
wait=True,
existing_status=UXAppStatus.RUNNING,
)
== ""
)
assert m_subp.call_count == 1
assert m_sleep.call_count == 0