diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 00000000..2ee95c0c --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,72 @@ +name: Bug report +description: Report a reproducible problem with Node Scraper +title: "[Bug]: " +labels: ["bug"] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to report a bug. Please include enough detail to reproduce the issue. + - type: textarea + id: summary + attributes: + label: Summary + description: What happened and what did you expect to happen? + placeholder: Clear and concise description of the bug. + validations: + required: true + - type: textarea + id: steps + attributes: + label: Steps to reproduce + description: Minimal, concrete steps to reproduce the issue. + placeholder: | + 1. ... + 2. ... + 3. ... + validations: + required: true + - type: textarea + id: logs + attributes: + label: CLI output / logs + description: Paste relevant output (redact secrets/tokens/hosts as needed). + render: shell + validations: + required: false + - type: input + id: version + attributes: + label: Version + description: Output of `node-scraper --version` (or the git commit hash). + placeholder: "e.g. 0.6.1" + validations: + required: false + - type: dropdown + id: install_method + attributes: + label: Install method + options: + - PyPI (pip install amd-node-scraper) + - Editable install from source (pip install -e .) + - Other + validations: + required: false + - type: input + id: python + attributes: + label: Python version + placeholder: "e.g. 3.11.8" + validations: + required: false + - type: textarea + id: environment + attributes: + label: Environment + description: OS, target system type (LOCAL/REMOTE), and anything else relevant. 
+ placeholder: | + OS: + Target: + Notes: + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 00000000..7a048e76 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,8 @@ +blank_issues_enabled: false +contact_links: + - name: Questions + url: https://github.com/amd/node-scraper/issues + about: If you're not sure it's a bug/feature request yet, start with an issue and add details. + - name: Security reports + url: https://github.com/amd/node-scraper/security/policy + about: Please report security issues privately (see SECURITY.md). diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 00000000..b9249856 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,38 @@ +name: Feature request +description: Suggest an enhancement or new capability +title: "[Feature]: " +labels: ["enhancement"] +body: + - type: markdown + attributes: + value: | + Thanks for the suggestion. Please describe the problem you're trying to solve and what success looks like. + - type: textarea + id: problem + attributes: + label: Problem / motivation + description: What problem are you trying to solve? + placeholder: "I want to..." + validations: + required: true + - type: textarea + id: proposal + attributes: + label: Proposed solution + description: What would you like Node Scraper to do? + validations: + required: true + - type: textarea + id: alternatives + attributes: + label: Alternatives considered + description: What else have you tried or considered? + validations: + required: false + - type: textarea + id: scope + attributes: + label: Scope / impact + description: What plugins/OSes/environments would this affect? 
+ validations: + required: false diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 00000000..269ee465 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,12 @@ +## Summary +- + +## Test plan +- [ ] `pytest test/unit` +- [ ] `pytest test/functional` (if applicable) +- [ ] `pre-commit run --all-files` + +## Checklist +- [ ] Added/updated tests (or explained why not) +- [ ] Updated docs/README if behavior changed +- [ ] No secrets or credentials committed diff --git a/.github/scripts/plugin_convention_warnings.py b/.github/scripts/plugin_convention_warnings.py new file mode 100755 index 00000000..caaf03f6 --- /dev/null +++ b/.github/scripts/plugin_convention_warnings.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python3 +"""Checks conventions under ``nodescraper/plugins`` (stderr warnings only; non-blocking). + +1. **Command strings in collectors/analyzers** , for ``Collector`` + or ``Analyzer`` classes: a *class-level* assignment to a string (or f-string) that + looks like a shell/CLI invocation must use the name ``CMD`` or + ``CMD_`` (e.g. ``CMD_LIST``). Names starting with ``_`` and names + listed in ``_CMD_CHECK_SKIP_NAMES`` are ignored; see + ``_looks_like_shell_command_literal`` for what counts as command-like. + +2. **Args models** — In ``collector_args.py`` and ``analyzer_args.py``, + for classes named ``*Args`` that subclass ``BaseModel``, ``CollectorArgs``, + ``AnalyzerArgs``, or another ``*Args``: each public field should assign + ``pydantic.Field(...)`` with a non-empty ``description=`` (for help/CLI + text). ``ClassVar`` fields, ``_``-prefixed names, and ``model_config`` are + skipped. +""" + +from __future__ import annotations + +import ast +import re +import sys +from pathlib import Path + +_REPO_ROOT = Path(__file__).resolve().parent.parent.parent +PLUGIN_ROOT = _REPO_ROOT / "nodescraper" / "plugins" + +# Class-level names in collectors/analyzers that are not shell-command strings. 
# Class-level attribute names on Collector/Analyzer classes that legitimately
# hold strings but are NOT shell-command invocations (so rule #1 skips them).
_CMD_CHECK_SKIP_NAMES = frozenset(
    {
        "AMD_SMI_EXE",
        "DATA_MODEL",
        "SUPPORTED_OS_FAMILY",
        "COLLECTOR",
        "ANALYZER",
        "COLLECTOR_ARGS",
        "ANALYZER_ARGS",
        "TYPE_CHECKING",
    }
)


def _is_stringish(expr: ast.expr) -> bool:
    """Return True when *expr* is a plain string constant or an f-string."""
    if isinstance(expr, ast.Constant) and isinstance(expr.value, str):
        return True
    if isinstance(expr, ast.JoinedStr):
        return True
    return False


def _stringish_preview(expr: ast.expr) -> str | None:
    """Best-effort static string for command-like heuristics (f-strings may be partial).

    Dynamic f-string segments are replaced with a ``"\\x00"`` placeholder so the
    surrounding literal text can still be matched by the heuristics. Returns
    None when *expr* is not a string-like node at all.
    """
    if isinstance(expr, ast.Constant) and isinstance(expr.value, str):
        return expr.value
    if isinstance(expr, ast.JoinedStr):
        parts: list[str] = []
        for elt in expr.values:
            if isinstance(elt, ast.Constant) and isinstance(elt.value, str):
                parts.append(elt.value)
            else:
                parts.append("\x00")  # dynamic segment
        return "".join(parts) if parts else ""
    return None


def _looks_like_shell_command_literal(s: str) -> bool:
    """True if this class-level string is plausibly a shell/CLI invocation (not IDs, tokens, paths)."""
    s = s.strip()
    if not s:
        return False
    if re.fullmatch(r"0x[0-9a-fA-F]+", s):
        return False
    # OS / config tokens such as PRETTY_NAME, VERSION_ID
    if re.fullmatch(r"[A-Z][A-Z0-9_]+", s):
        return False
    # Filenames / simple paths (no shell metacharacters)
    if "." in s and not re.search(r"[\s|;&$`]", s):
        return False
    if re.search(r"[\s|;&$`<>]", s):
        return True
    # Typical one-word inband commands: uptime, sysctl, dmesg, amd-smi, etc.
    if re.fullmatch(r"[a-z][a-z0-9_.-]*", s, flags=re.IGNORECASE):
        return True
    return False


def _base_name(node: ast.expr) -> str | None:
    """Return the rightmost identifier of a base-class expression (handles ``X``, ``X[T]``, ``m.X``)."""
    if isinstance(node, ast.Name):
        return node.id
    if isinstance(node, ast.Subscript):
        return _base_name(node.value)
    if isinstance(node, ast.Attribute):
        return node.attr
    return None


def _is_collector_or_analyzer_class(cls: ast.ClassDef) -> bool:
    """Keep only classes whose names end with Collector or Analyzer (e.g. ProcessCollector, PcieAnalyzer)."""
    return cls.name.endswith("Collector") or cls.name.endswith("Analyzer")


def _field_call_name(func: ast.expr) -> bool:
    """Return True when *func* is a call target named ``Field`` (bare or attribute form)."""
    if isinstance(func, ast.Name) and func.id == "Field":
        return True
    if isinstance(func, ast.Attribute) and func.attr == "Field":
        return True
    return False


def _field_has_nonempty_description(call: ast.Call) -> bool:
    """Return True when a ``Field(...)`` call has a non-empty string ``description=`` keyword."""
    for kw in call.keywords:
        if kw.arg != "description" or kw.value is None:
            continue
        v = kw.value
        if isinstance(v, ast.Constant) and isinstance(v.value, str) and v.value.strip():
            return True
    return False


def _check_cmd_prefixes(path: Path, tree: ast.Module) -> list[str]:
    """Rule #1: warn when a command-like class attr is not ``CMD`` / ``CMD_*``."""
    msgs: list[str] = []
    for node in tree.body:
        if not isinstance(node, ast.ClassDef) or not _is_collector_or_analyzer_class(node):
            continue
        for stmt in node.body:
            # Only single-target, plain assignments at class scope are considered.
            if not isinstance(stmt, ast.Assign) or len(stmt.targets) != 1:
                continue
            t = stmt.targets[0]
            if not isinstance(t, ast.Name):
                continue
            name = t.id
            if name.startswith("_") or name in _CMD_CHECK_SKIP_NAMES:
                continue
            if not _is_stringish(stmt.value):
                continue
            preview = _stringish_preview(stmt.value)
            if preview is None or not _looks_like_shell_command_literal(preview):
                continue
            if name == "CMD" or name.startswith("CMD_"):
                continue
            msgs.append(
                f"{path}:{stmt.lineno}: [{node.name}] command-like class attribute {name!r} "
                "should be renamed to CMD or to start with CMD_."
            )
    return msgs


def _is_args_class(cls: ast.ClassDef) -> bool:
    """Return True for ``*Args`` classes that subclass BaseModel / *Args bases (rule #2 targets)."""
    if not cls.name.endswith("Args"):
        return False
    if not cls.bases:
        return False
    for b in cls.bases:
        bn = _base_name(b)
        if bn in ("BaseModel", "CollectorArgs", "AnalyzerArgs"):
            return True
        if bn and bn.endswith("Args"):
            return True
    return False


def _annotation_mentions_classvar(ann: ast.expr | None) -> bool:
    """Return True when an annotation is (or contains) ``ClassVar`` — such fields are skipped."""
    if ann is None:
        return False
    if isinstance(ann, ast.Name) and ann.id == "ClassVar":
        return True
    if isinstance(ann, ast.Subscript):
        return _annotation_mentions_classvar(ann.value)
    if isinstance(ann, ast.Attribute) and ann.attr == "ClassVar":
        return True
    if isinstance(ann, ast.BinOp) and isinstance(ann.op, ast.BitOr):
        return _annotation_mentions_classvar(ann.left) or _annotation_mentions_classvar(ann.right)
    return False


def _check_args_fields(path: Path, tree: ast.Module) -> list[str]:
    """Rule #2: warn when Args fields lack ``Field`` with a non-empty ``description``."""
    msgs: list[str] = []
    for node in tree.body:
        if not isinstance(node, ast.ClassDef) or not _is_args_class(node):
            continue
        for stmt in node.body:
            if isinstance(stmt, ast.AnnAssign):
                if _annotation_mentions_classvar(stmt.annotation):
                    continue
                if not isinstance(stmt.target, ast.Name):
                    continue
                field_name = stmt.target.id
                if field_name.startswith("_") or field_name in ("model_config",):
                    continue
                if stmt.value is None:
                    # Annotated field with no default at all.
                    msgs.append(
                        f"{path}:{stmt.lineno}: [{node.name}] {field_name}: "
                        "use Field(..., description='...') for every Args field."
                    )
                    continue
                if isinstance(stmt.value, ast.Call) and _field_call_name(stmt.value.func):
                    if not _field_has_nonempty_description(stmt.value):
                        msgs.append(
                            f"{path}:{stmt.lineno}: [{node.name}] {field_name}: "
                            "Field(...) must include a non-empty description= for help text."
                        )
                else:
                    # Annotated field assigned something other than Field(...).
                    msgs.append(
                        f"{path}:{stmt.lineno}: [{node.name}] {field_name}: "
                        "must assign pydantic Field(...) with description=."
                    )
            elif isinstance(stmt, ast.Assign) and len(stmt.targets) == 1:
                # Un-annotated assignment: only flag Field(...) calls missing a
                # description; plain constants here are not pydantic fields.
                t = stmt.targets[0]
                if not isinstance(t, ast.Name):
                    continue
                field_name = t.id
                if field_name.startswith("_") or field_name in ("model_config",):
                    continue
                val = stmt.value
                if isinstance(val, ast.Call) and _field_call_name(val.func):
                    if not _field_has_nonempty_description(val):
                        msgs.append(
                            f"{path}:{stmt.lineno}: [{node.name}] {field_name}: "
                            "Field(...) must include a non-empty description= for help text."
                        )
    return msgs


def main() -> None:
    """Scan every plugin source file and emit convention warnings (never blocks the commit)."""
    if not PLUGIN_ROOT.is_dir():
        sys.stderr.write(f"warning: plugins directory not found: {PLUGIN_ROOT}\n")
        return

    all_msgs: list[str] = []
    for path in sorted(PLUGIN_ROOT.rglob("*.py")):
        rel = path.relative_to(_REPO_ROOT)
        name = path.name
        try:
            src = path.read_text(encoding="utf-8")
            tree = ast.parse(src, filename=str(path))
        except (OSError, SyntaxError) as e:
            all_msgs.append(f"{rel}: could not parse: {e}")
            continue

        # Run the CMD-prefix check at most once per file: the previous pair of
        # independent "collector"/"analyzer" checks duplicated every warning
        # for a file whose name contains both substrings. rglob("*.py") already
        # guarantees the .py suffix, so no endswith() re-check is needed.
        if "collector" in name or "analyzer" in name:
            all_msgs.extend(_check_cmd_prefixes(rel, tree))

        if name in ("collector_args.py", "analyzer_args.py"):
            all_msgs.extend(_check_args_fields(rel, tree))

    if all_msgs:
        sys.stderr.write("plugin convention warnings (commit not blocked):\n")
        for m in all_msgs:
            sys.stderr.write(f" WARNING: {m}\n")
    else:
        sys.stdout.write("Success: no plugin convention warnings.\n")
    sys.exit(0)


if __name__ == "__main__":
    main()
@@ repos: + - repo: local + hooks: + - id: plugin-convention-warnings + name: plugin convention warnings (non-blocking) + entry: python3 .github/scripts/plugin_convention_warnings.py + language: system + pass_filenames: false + always_run: true + verbose: true - repo: https://github.com/pre-commit/pre-commit-hooks rev: v5.0.0 hooks: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index de74bbae..da2ac19d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -90,3 +90,14 @@ You can also run hooks manually: # Run all hooks on all files pre-commit run --all-files ``` + +### Plugin conventions + +We follow a few plugin design conventions so that +generation and downstream doc tooling run cleanly—for example, naming +command strings on `*Collector` / `*Analyzer` classes as `CMD` or `CMD_*`, and +using `pydantic.Field(..., description=...)` on args models. The +`plugin-convention-warnings` hook in pre-commit runs +[`.github/scripts/plugin_convention_warnings.py`](.github/scripts/plugin_convention_warnings.py) +to flag violations (warnings only); read the script’s module docstring for the +exact rules. diff --git a/README.md b/README.md index 7cdfc78d..eda8dea4 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ system debug. ## Table of Contents - [Installation](#installation) + - [Install from PyPI](#install-from-pypi) - [Install From Source](#install-from-source) - [CLI Usage](#cli-usage) - [Execution Methods](#execution-methods) @@ -14,7 +15,6 @@ system debug. - ['run-plugins' sub command](#run-plugins-sub-command) - ['gen-plugin-config' sub command](#gen-plugin-config-sub-command) - ['compare-runs' subcommand](#compare-runs-subcommand) - - ['show-redfish-oem-allowable' subcommand](#show-redfish-oem-allowable-subcommand) - ['summary' sub command](#summary-sub-command) - [Configs](#configs) - [Global args](#global-args) @@ -25,6 +25,19 @@ system debug. 
invoked by collectors** -> See [docs/PLUGIN_DOC.md](docs/PLUGIN_DOC.md) ## Installation +### Install from PyPI +Node Scraper is published on [PyPI](https://pypi.org/project/amd-node-scraper/) as **amd-node-scraper**. Install it with Python 3.9 or newer: + +```sh +pip install amd-node-scraper +``` + +Use a virtual environment if you prefer. After installation, confirm the CLI is available: + +```sh +node-scraper --help +``` + ### Install From Source Node Scraper requires Python 3.9+ for installation. After cloning this repository, call dev-setup.sh script with 'source'. This script creates an editable install of Node Scraper in @@ -59,46 +72,70 @@ Sets up pre-commit hooks for code quality checks. On Debian/Ubuntu, you may need The Node Scraper CLI can be used to run Node Scraper plugins on a target system. The following CLI options are available: + ```sh -usage: node-scraper [-h] [--sys-name STRING] [--sys-location {LOCAL,REMOTE}] [--sys-interaction-level {PASSIVE,INTERACTIVE,DISRUPTIVE}] [--sys-sku STRING] - [--sys-platform STRING] [--plugin-configs [STRING ...]] [--system-config STRING] [--connection-config STRING] [--log-path STRING] - [--log-level {CRITICAL,FATAL,ERROR,WARN,WARNING,INFO,DEBUG,NOTSET}] [--gen-reference-config] [--skip-sudo] - {summary,run-plugins,describe,gen-plugin-config} ... +usage: cli.py [-h] [--version] [--sys-name STRING] + [--sys-location {LOCAL,REMOTE}] + [--sys-interaction-level {PASSIVE,INTERACTIVE,DISRUPTIVE}] + [--sys-sku STRING] [--sys-platform STRING] + [--plugin-configs [STRING ...]] [--system-config STRING] + [--connection-config STRING] [--log-path STRING] + [--log-level {CRITICAL,FATAL,ERROR,WARN,WARNING,INFO,DEBUG,NOTSET}] + [--no-console-log] [--gen-reference-config] [--skip-sudo] + {summary,run-plugins,describe,gen-plugin-config,compare-runs,show-redfish-oem-allowable} + ... 
node scraper CLI positional arguments: - {summary,run-plugins,describe,gen-plugin-config} + {summary,run-plugins,describe,gen-plugin-config,compare-runs,show-redfish-oem-allowable} Subcommands summary Generates summary csv file run-plugins Run a series of plugins describe Display details on a built-in config or plugin gen-plugin-config Generate a config for a plugin or list of plugins + compare-runs Compare datamodels from two run log directories + show-redfish-oem-allowable + Fetch OEM diagnostic allowable types from Redfish + LogService (for oem_diagnostic_types_allowable) options: -h, --help show this help message and exit - --sys-name STRING System name (default: ) + --version show program's version number and exit + --sys-name STRING System name (default: ) --sys-location {LOCAL,REMOTE} Location of target system (default: LOCAL) --sys-interaction-level {PASSIVE,INTERACTIVE,DISRUPTIVE} - Specify system interaction level, used to determine the type of actions that plugins can perform (default: INTERACTIVE) + Specify system interaction level, used to determine + the type of actions that plugins can perform (default: + INTERACTIVE) --sys-sku STRING Manually specify SKU of system (default: None) --sys-platform STRING Specify system platform (default: None) --plugin-configs [STRING ...] - built-in config names or paths to plugin config JSONs. Available built-in configs: AllPlugins, NodeStatus (default: None) + built-in config names or paths to plugin config JSONs. + Available built-in configs: NodeStatus, AllPlugins + (default: None) --system-config STRING Path to system config json (default: None) --connection-config STRING Path to connection config json (default: None) - --log-path STRING Specifies local path for node scraper logs, use 'None' to disable logging (default: .) + --log-path STRING Specifies local path for node scraper logs, use 'None' + to disable logging (default: .) 
--log-level {CRITICAL,FATAL,ERROR,WARN,WARNING,INFO,DEBUG,NOTSET} Change python log level (default: INFO) + --no-console-log Write logs only to nodescraper.log under the run + directory; do not print to stdout. If no run log + directory would be created (e.g. --log-path None), + uses ./scraper_logs__/ like the + default layout. (default: False) --gen-reference-config - Generate reference config from system. Writes to ./reference_config.json. (default: False) - --skip-sudo Skip plugins that require sudo permissions (default: False) - + Generate reference config from system. Writes to + ./reference_config.json. (default: False) + --skip-sudo Skip plugins that require sudo permissions (default: + False) ``` + ### Execution Methods @@ -501,6 +538,17 @@ A plugin config can be used to compare the system data against the config specif Built-in configs include **NodeStatus** (a subset of plugins) and **AllPlugins** (runs every registered plugin with default arguments—useful for generating a reference config from the full system). +**NodeStatus plus additional plugins** — built-in configs merge with plugins named after `run-plugins`. +Use **`--plugin-configs=`** (equals form): with a space +after `--plugin-configs`. See below for examples: +```sh +node-scraper --plugin-configs=NodeStatus run-plugins PciePlugin +``` + +```sh +node-scraper --log-path ./logs --plugin-configs=NodeStatus run-plugins PciePlugin +``` + Using a JSON file: ```sh node-scraper --plugin-configs plugin_config.json diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 00000000..89d7b1ba --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,34 @@ +# Security Policy + +## Supported Versions + +We provide security updates for the **latest released version** and for the **default branch**. 
+ +| Version | Supported | +| ------- | ------------------ | +| Default branch (`main` / `development`) | :white_check_mark: | +| Latest PyPI release (`amd-node-scraper`) | :white_check_mark: | +| Older releases | :x: | + +## Reporting a Vulnerability + +Please **do not** open a public issue for security vulnerabilities. + +### Preferred: private vulnerability report (GitHub) + +If enabled for this repository, use GitHub’s private vulnerability reporting: + +- `https://github.com/amd/node-scraper/security/advisories/new` + +### What to include + +- A clear description of the issue and potential impact +- Steps to reproduce (or a proof-of-concept, if safe to share) +- The output of `node-scraper --version` (or a git commit hash) +- Any relevant configuration details (please redact secrets/credentials/hostnames/IPs) + +### What to expect + +- We will acknowledge receipt as soon as practical. +- If the report is accepted, we’ll work on a fix and coordinate disclosure timing. +- If the report is declined (e.g., not a security issue), we’ll explain why. diff --git a/docs/PLUGIN_DOC.md b/docs/PLUGIN_DOC.md index 921045bc..31b7ff74 100644 --- a/docs/PLUGIN_DOC.md +++ b/docs/PLUGIN_DOC.md @@ -4,31 +4,31 @@ | Plugin | Collection | Analyzer Args | Collection Args | DataModel | Collector | Analyzer | | --- | --- | --- | --- | --- | --- | --- | -| AmdSmiPlugin | bad-pages
firmware --json
list --json
metric -g all
partition --json
process --json
ras --cper --folder={folder}
ras --afid --cper-file {cper_file}
static -g all --json
static -g {gpu_id} --json
topology
version --json
xgmi -l
xgmi -m | **Analyzer Args:**
- `check_static_data`: bool
- `expected_gpu_processes`: Optional[int]
- `expected_max_power`: Optional[int]
- `expected_driver_version`: Optional[str]
- `expected_memory_partition_mode`: Optional[str]
- `expected_compute_partition_mode`: Optional[str]
- `expected_pldm_version`: Optional[str]
- `l0_to_recovery_count_error_threshold`: Optional[int]
- `l0_to_recovery_count_warning_threshold`: Optional[int]
- `vendorid_ep`: Optional[str]
- `vendorid_ep_vf`: Optional[str]
- `devid_ep`: Optional[str]
- `devid_ep_vf`: Optional[str]
- `sku_name`: Optional[str]
- `expected_xgmi_speed`: Optional[list[float]]
- `analysis_range_start`: Optional[datetime.datetime]
- `analysis_range_end`: Optional[datetime.datetime] | **Collection Args:**
- `cper_file_path`: Optional[str] | [AmdSmiDataModel](#AmdSmiDataModel-Model) | [AmdSmiCollector](#Collector-Class-AmdSmiCollector) | [AmdSmiAnalyzer](#Data-Analyzer-Class-AmdSmiAnalyzer) | -| BiosPlugin | sh -c 'cat /sys/devices/virtual/dmi/id/bios_version'
wmic bios get SMBIOSBIOSVersion /Value | **Analyzer Args:**
- `exp_bios_version`: list[str]
- `regex_match`: bool | - | [BiosDataModel](#BiosDataModel-Model) | [BiosCollector](#Collector-Class-BiosCollector) | [BiosAnalyzer](#Data-Analyzer-Class-BiosAnalyzer) | -| CmdlinePlugin | cat /proc/cmdline | **Analyzer Args:**
- `required_cmdline`: Union[str, List]
- `banned_cmdline`: Union[str, List]
- `os_overrides`: Dict[str, nodescraper.plugins.inband.cmdline.cmdlineconfig.OverrideConfig]
- `platform_overrides`: Dict[str, nodescraper.plugins.inband.cmdline.cmdlineconfig.OverrideConfig] | - | [CmdlineDataModel](#CmdlineDataModel-Model) | [CmdlineCollector](#Collector-Class-CmdlineCollector) | [CmdlineAnalyzer](#Data-Analyzer-Class-CmdlineAnalyzer) | -| DeviceEnumerationPlugin | powershell -Command "(Get-WmiObject -Class Win32_Processor \| Measure-Object).Count"
lspci -d {vendorid_ep}: \| grep -i 'VGA\\|Display\\|3D' \| wc -l
powershell -Command "(wmic path win32_VideoController get name \| findstr AMD \| Measure-Object).Count"
lscpu
lshw
lspci -d {vendorid_ep}: \| grep -i 'Virtual Function' \| wc -l
powershell -Command "(Get-VMHostPartitionableGpu \| Measure-Object).Count" | **Analyzer Args:**
- `cpu_count`: Optional[list[int]]
- `gpu_count`: Optional[list[int]]
- `vf_count`: Optional[list[int]] | - | [DeviceEnumerationDataModel](#DeviceEnumerationDataModel-Model) | [DeviceEnumerationCollector](#Collector-Class-DeviceEnumerationCollector) | [DeviceEnumerationAnalyzer](#Data-Analyzer-Class-DeviceEnumerationAnalyzer) | -| DimmPlugin | sh -c 'dmidecode -t 17 \| tr -s " " \| grep -v "Volatile\\|None\\|Module" \| grep Size' 2>/dev/null
dmidecode
wmic memorychip get Capacity | - | **Collection Args:**
- `skip_sudo`: bool | [DimmDataModel](#DimmDataModel-Model) | [DimmCollector](#Collector-Class-DimmCollector) | - | -| DkmsPlugin | dkms status
dkms --version | **Analyzer Args:**
- `dkms_status`: Union[str, list]
- `dkms_version`: Union[str, list]
- `regex_match`: bool | - | [DkmsDataModel](#DkmsDataModel-Model) | [DkmsCollector](#Collector-Class-DkmsCollector) | [DkmsAnalyzer](#Data-Analyzer-Class-DkmsAnalyzer) | -| DmesgPlugin | dmesg --time-format iso -x
ls -1 /var/log/dmesg* 2>/dev/null \| grep -E '^/var/log/dmesg(\.[0-9]+(\.gz)?)?$' \|\| true | **Built-in Regexes:**
- Out of memory error: `(?:oom_kill_process.*)\|(?:Out of memory.*)`
- I/O Page Fault: `IO_PAGE_FAULT`
- Kernel Panic: `\bkernel panic\b.*`
- SQ Interrupt: `sq_intr`
- SRAM ECC: `sram_ecc.*`
- Failed to load driver. IP hardware init error.: `\[amdgpu\]\] \*ERROR\* hw_init of IP block.*`
- Failed to load driver. IP software init error.: `\[amdgpu\]\] \*ERROR\* sw_init of IP block.*`
- Real Time throttling activated: `sched: RT throttling activated.*`
- RCU preempt detected stalls: `rcu_preempt detected stalls.*`
- RCU preempt self-detected stall: `rcu_preempt self-detected stall.*`
- QCM fence timeout: `qcm fence wait loop timeout.*`
- General protection fault: `(?:[\w-]+(?:\[[0-9.]+\])?\s+)?general protectio...`
- Segmentation fault: `(?:segfault.*in .*\[)\|(?:[Ss]egmentation [Ff]au...`
- Failed to disallow cf state: `amdgpu: Failed to disallow cf state.*`
- Failed to terminate tmr: `\*ERROR\* Failed to terminate tmr.*`
- Suspend of IP block failed: `\*ERROR\* suspend of IP block <\w+> failed.*`
- amdgpu Page Fault: `(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:\s+\[\S...`
- Page Fault: `page fault for address.*`
- Fatal error during GPU init: `(?:amdgpu)(.*Fatal error during GPU init)\|(Fata...`
- PCIe AER Error Status: `(pcieport [\w:.]+: AER: aer_status:[^\n]*(?:\n[...`
- PCIe AER Correctable Error Status: `(.*aer_cor_status: 0x[0-9a-fA-F]+, aer_cor_mask...`
- PCIe AER Uncorrectable Error Status: `(.*aer_uncor_status: 0x[0-9a-fA-F]+, aer_uncor_...`
- PCIe AER Uncorrectable Error Severity with TLP Header: `(.*aer_uncor_severity: 0x[0-9a-fA-F]+.*)(\n.*TL...`
- Failed to read journal file: `Failed to read journal file.*`
- Journal file corrupted or uncleanly shut down: `journal corrupted or uncleanly shut down.*`
- ACPI BIOS Error: `ACPI BIOS Error`
- ACPI Error: `ACPI Error`
- Filesystem corrupted!: `EXT4-fs error \(device .*\):`
- Error in buffered IO, check filesystem integrity: `(Buffer I\/O error on dev)(?:ice)? (\w+)`
- PCIe card no longer present: `pcieport (\w+:\w+:\w+\.\w+):\s+(\w+):\s+(Slot\(...`
- PCIe Link Down: `pcieport (\w+:\w+:\w+\.\w+):\s+(\w+):\s+(Slot\(...`
- Mismatched clock configuration between PCIe device and host: `pcieport (\w+:\w+:\w+\.\w+):\s+(\w+):\s+(curren...`
- RAS Correctable Error: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- RAS Uncorrectable Error: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- RAS Deferred Error: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- RAS Corrected PCIe Error: `((?:\[Hardware Error\]:\s+)?event severity: cor...`
- GPU Reset: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- GPU reset failed: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- ACA Error: `(Accelerator Check Architecture[^\n]*)(?:\n[^\n...`
- ACA Error: `(Accelerator Check Architecture[^\n]*)(?:\n[^\n...`
- MCE Error: `\[Hardware Error\]:.+MC\d+_STATUS.*(?:\n.*){0,5}`
- Mode 2 Reset Failed: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)? (...`
- RAS Corrected Error: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- SGX Error: `x86/cpu: SGX disabled by BIOS`
- MMP Error: `Failed to load MMP firmware qat_4xxx_mmp.bin`
- GPU Throttled: `amdgpu \w{4}:\w{2}:\w{2}.\w: amdgpu: WARN: GPU ...`
- RAS Poison Consumed: `amdgpu[ 0-9a-fA-F:.]+:(?:\s*amdgpu:)?\s+(?:{\d+...`
- RAS Poison created: `amdgpu[ 0-9a-fA-F:.]+:(?:\s*amdgpu:)?\s+(?:{\d+...`
- Bad page threshold exceeded: `(amdgpu: Saved bad pages (\d+) reaches threshol...`
- RAS Hardware Error: `Hardware error from APEI Generic Hardware Error...`
- Error Address: `Error Address.*(?:\s.*)`
- RAS EDR Event: `EDR: EDR event received`
- DPC Event: `DPC: .*`
- LNet: ko2iblnd has no matching interfaces: `(?:\[[^\]]+\]\s*)?LNetError:.*ko2iblnd:\s*No ma...`
- LNet: Error starting up LNI: `(?:\[[^\]]+\]\s*)?LNetError:\s*.*Error\s*-?\d+\...`
- Lustre: network initialisation failed: `LustreError:.*ptlrpc_init_portals\(\).*network ...` | **Collection Args:**
- `collect_rotated_logs`: bool
- `skip_sudo`: bool
- `log_dmesg_data`: bool | [DmesgData](#DmesgData-Model) | [DmesgCollector](#Collector-Class-DmesgCollector) | [DmesgAnalyzer](#Data-Analyzer-Class-DmesgAnalyzer) | -| FabricsPlugin | lspci \| grep -i cassini
lsmod \| grep cxi
cxi_stat
ibstat
ibv_devinfo
ls -l /sys/class/infiniband/*/device/net
fi_info -p cxi
mst start
mst status -v
ip link show
ofed_info -s | - | - | [FabricsDataModel](#FabricsDataModel-Model) | [FabricsCollector](#Collector-Class-FabricsCollector) | - | -| JournalPlugin | journalctl --no-pager --system --output=short-iso
journalctl --no-pager --system --output=json | **Analyzer Args:**
- `check_priority`: Optional[int]
- `group`: bool | **Collection Args:**
- `boot`: Optional[int] | [JournalData](#JournalData-Model) | [JournalCollector](#Collector-Class-JournalCollector) | [JournalAnalyzer](#Data-Analyzer-Class-JournalAnalyzer) | -| KernelPlugin | sh -c 'uname -a'
sh -c 'cat /proc/sys/kernel/numa_balancing'
wmic os get Version /Value | **Analyzer Args:**
- `exp_kernel`: Union[str, list]
- `exp_numa`: Optional[int]
- `regex_match`: bool | - | [KernelDataModel](#KernelDataModel-Model) | [KernelCollector](#Collector-Class-KernelCollector) | [KernelAnalyzer](#Data-Analyzer-Class-KernelAnalyzer) | -| KernelModulePlugin | cat /proc/modules
modinfo amdgpu
wmic os get Version /Value | **Analyzer Args:**
- `kernel_modules`: dict[str, dict]
- `regex_filter`: list[str] | - | [KernelModuleDataModel](#KernelModuleDataModel-Model) | [KernelModuleCollector](#Collector-Class-KernelModuleCollector) | [KernelModuleAnalyzer](#Data-Analyzer-Class-KernelModuleAnalyzer) | -| MemoryPlugin | free -b
lsmem
numactl -H
wmic OS get FreePhysicalMemory /Value; wmic ComputerSystem get TotalPhysicalMemory /Value | **Analyzer Args:**
- `ratio`: float
- `memory_threshold`: str | - | [MemoryDataModel](#MemoryDataModel-Model) | [MemoryCollector](#Collector-Class-MemoryCollector) | [MemoryAnalyzer](#Data-Analyzer-Class-MemoryAnalyzer) | -| NetworkPlugin | ip addr show
curl
ethtool -S {interface}
ethtool {interface}
lldpcli show neighbor
lldpctl
ip neighbor show
ping
ip route show
ip rule show
wget | - | **Collection Args:**
- `url`: Optional[str]
- `netprobe`: Optional[Literal['ping', 'wget', 'curl']] | [NetworkDataModel](#NetworkDataModel-Model) | [NetworkCollector](#Collector-Class-NetworkCollector) | - | -| NicPlugin | - | **Analyzer Args:**
- `expected_values`: Optional[Dict[str, Dict[str, Any]]]
- `performance_profile_expected`: str
- `support_rdma_disabled_values`: List[str]
- `pcie_relaxed_ordering_expected`: str
- `expected_qos_prio_map`: Optional[Dict[Any, Any]]
- `expected_qos_pfc_enabled`: Optional[int]
- `expected_qos_tsa_map`: Optional[Dict[Any, Any]]
- `expected_qos_tc_bandwidth`: Optional[List[int]]
- `require_qos_consistent_across_adapters`: bool
- `nicctl_log_error_regex`: Optional[List[Dict[str, Any]]] | **Collection Args:**
- `commands`: Optional[List[str]]
- `use_sudo_niccli`: bool
- `use_sudo_nicctl`: bool | [NicDataModel](#NicDataModel-Model) | [NicCollector](#Collector-Class-NicCollector) | [NicAnalyzer](#Data-Analyzer-Class-NicAnalyzer) | +| AmdSmiPlugin | bad-pages
firmware --json
list --json
metric -g all
partition --json
process --json
ras --cper --folder={folder}
ras --afid --cper-file {cper_file}
static -g all --json
static -g {gpu_id} --json
topology
version --json
xgmi -l
xgmi -m | **Analyzer Args:**
- `check_static_data`: bool — If True, run static data checks (e.g. driver version, partition mode).
- `expected_gpu_processes`: Optional[int] — Expected number of GPU processes.
- `expected_max_power`: Optional[int] — Expected maximum power value (e.g. watts).
- `expected_driver_version`: Optional[str] — Expected AMD driver version string.
- `expected_memory_partition_mode`: Optional[str] — Expected memory partition mode (e.g. NPS1, NPS4).
- `expected_compute_partition_mode`: Optional[str] — Expected compute partition mode.
- `expected_firmware_versions`: Optional[dict[str, str]] — Expected firmware versions keyed by amd-smi fw_id (e.g. PLDM_BUNDLE).
- `l0_to_recovery_count_error_threshold`: Optional[int] — L0-to-recovery count above which an error is raised.
- `l0_to_recovery_count_warning_threshold`: Optional[int] — L0-to-recovery count above which a warning is raised.
- `vendorid_ep`: Optional[str] — Expected endpoint vendor ID (e.g. for PCIe).
- `vendorid_ep_vf`: Optional[str] — Expected endpoint VF vendor ID.
- `devid_ep`: Optional[str] — Expected endpoint device ID.
- `devid_ep_vf`: Optional[str] — Expected endpoint VF device ID.
- `sku_name`: Optional[str] — Expected SKU name string for GPU.
- `expected_xgmi_speed`: Optional[list[float]] — Expected xGMI speed value(s) (e.g. link rate).
- `analysis_range_start`: Optional[datetime.datetime] — Start of time range for time-windowed analysis.
- `analysis_range_end`: Optional[datetime.datetime] — End of time range for time-windowed analysis. | **Collection Args:**
- `cper_file_path`: Optional[str] — Path to CPER folder or file for RAS AFID collection (ras --afid --cper-file). | [AmdSmiDataModel](#AmdSmiDataModel-Model) | [AmdSmiCollector](#Collector-Class-AmdSmiCollector) | [AmdSmiAnalyzer](#Data-Analyzer-Class-AmdSmiAnalyzer) | +| BiosPlugin | sh -c 'cat /sys/devices/virtual/dmi/id/bios_version'
wmic bios get SMBIOSBIOSVersion /Value | **Analyzer Args:**
- `exp_bios_version`: list[str] — Expected BIOS version(s) to match against collected value (str or list).
- `regex_match`: bool — If True, match exp_bios_version as regex; otherwise exact match. | - | [BiosDataModel](#BiosDataModel-Model) | [BiosCollector](#Collector-Class-BiosCollector) | [BiosAnalyzer](#Data-Analyzer-Class-BiosAnalyzer) | +| CmdlinePlugin | cat /proc/cmdline | **Analyzer Args:**
- `required_cmdline`: Union[str, List] — Command-line parameters that must be present (e.g. 'pci=bfsort').
- `banned_cmdline`: Union[str, List] — Command-line parameters that must not be present.
- `os_overrides`: Dict[str, nodescraper.plugins.inband.cmdline.cmdlineconfig.OverrideConfig] — Per-OS overrides for required_cmdline and banned_cmdline (keyed by OS identifier).
- `platform_overrides`: Dict[str, nodescraper.plugins.inband.cmdline.cmdlineconfig.OverrideConfig] — Per-platform overrides for required_cmdline and banned_cmdline (keyed by platform). | - | [CmdlineDataModel](#CmdlineDataModel-Model) | [CmdlineCollector](#Collector-Class-CmdlineCollector) | [CmdlineAnalyzer](#Data-Analyzer-Class-CmdlineAnalyzer) | +| DeviceEnumerationPlugin | powershell -Command "(Get-WmiObject -Class Win32_Processor | Measure-Object).Count"
lspci -d {vendorid_ep}: | grep -i 'VGA\|Display\|3D' | wc -l
powershell -Command "(wmic path win32_VideoController get name | findstr AMD | Measure-Object).Count"
lscpu
lshw
lspci -d {vendorid_ep}: | grep -i 'Virtual Function' | wc -l
powershell -Command "(Get-VMHostPartitionableGpu | Measure-Object).Count" | **Analyzer Args:**
- `cpu_count`: Optional[list[int]] — Expected CPU count(s); pass as int or list of ints. Analysis passes if actual is in list.
- `gpu_count`: Optional[list[int]] — Expected GPU count(s); pass as int or list of ints. Analysis passes if actual is in list.
- `vf_count`: Optional[list[int]] — Expected virtual function count(s); pass as int or list of ints. Analysis passes if actual is in list. | - | [DeviceEnumerationDataModel](#DeviceEnumerationDataModel-Model) | [DeviceEnumerationCollector](#Collector-Class-DeviceEnumerationCollector) | [DeviceEnumerationAnalyzer](#Data-Analyzer-Class-DeviceEnumerationAnalyzer) | +| DimmPlugin | sh -c 'dmidecode -t 17 | tr -s " " | grep -v "Volatile\|None\|Module" | grep Size' 2>/dev/null
dmidecode
wmic memorychip get Capacity | - | **Collection Args:**
- `skip_sudo`: bool — If True, do not use sudo when running dmidecode or wmic for memory info. | [DimmDataModel](#DimmDataModel-Model) | [DimmCollector](#Collector-Class-DimmCollector) | - | +| DkmsPlugin | dkms status
dkms --version | **Analyzer Args:**
- `dkms_status`: Union[str, list] — Expected dkms status string(s) to match (e.g. 'amd/1.0.0'). At least one of dkms_status or dkms_version required.
- `dkms_version`: Union[str, list] — Expected dkms version string(s) to match. At least one of dkms_status or dkms_version required.
- `regex_match`: bool — If True, match dkms_status and dkms_version as regex; otherwise exact match. | - | [DkmsDataModel](#DkmsDataModel-Model) | [DkmsCollector](#Collector-Class-DkmsCollector) | [DkmsAnalyzer](#Data-Analyzer-Class-DkmsAnalyzer) | +| DmesgPlugin | dmesg --time-format iso -x
ls -1 /var/log/dmesg* 2>/dev/null | grep -E '^/var/log/dmesg(\.[0-9]+(\.gz)?)?$' || true | **Built-in Regexes:**
- Out of memory error: `(?:oom_kill_process.*)|(?:Out of memory.*)`
- I/O Page Fault: `IO_PAGE_FAULT`
- Kernel Panic: `\bkernel panic\b.*`
- SQ Interrupt: `sq_intr`
- SRAM ECC: `sram_ecc.*`
- Failed to load driver. IP hardware init error.: `\[amdgpu\]\] \*ERROR\* hw_init of IP block.*`
- Failed to load driver. IP software init error.: `\[amdgpu\]\] \*ERROR\* sw_init of IP block.*`
- Real Time throttling activated: `sched: RT throttling activated.*`
- RCU preempt detected stalls: `rcu_preempt detected stalls.*`
- RCU preempt self-detected stall: `rcu_preempt self-detected stall.*`
- QCM fence timeout: `qcm fence wait loop timeout.*`
- General protection fault: `(?:[\w-]+(?:\[[0-9.]+\])?\s+)?general protectio...`
- Segmentation fault: `(?:segfault.*in .*\[)|(?:[Ss]egmentation [Ff]au...`
- Failed to disallow cf state: `amdgpu: Failed to disallow cf state.*`
- Failed to terminate tmr: `\*ERROR\* Failed to terminate tmr.*`
- Suspend of IP block failed: `\*ERROR\* suspend of IP block <\w+> failed.*`
- amdgpu Page Fault: `(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:\s+\[\S...`
- Page Fault: `page fault for address.*`
- Fatal error during GPU init: `(?:amdgpu)(.*Fatal error during GPU init)|(Fata...`
- PCIe AER Error Status: `(pcieport [\w:.]+: AER: aer_status:[^\n]*(?:\n[...`
- PCIe AER Correctable Error Status: `(.*aer_cor_status: 0x[0-9a-fA-F]+, aer_cor_mask...`
- PCIe AER Uncorrectable Error Status: `(.*aer_uncor_status: 0x[0-9a-fA-F]+, aer_uncor_...`
- PCIe AER Uncorrectable Error Severity with TLP Header: `(.*aer_uncor_severity: 0x[0-9a-fA-F]+.*)(\n.*TL...`
- Failed to read journal file: `Failed to read journal file.*`
- Journal file corrupted or uncleanly shut down: `journal corrupted or uncleanly shut down.*`
- ACPI BIOS Error: `ACPI BIOS Error`
- ACPI Error: `ACPI Error`
- Filesystem corrupted!: `EXT4-fs error \(device .*\):`
- Error in buffered IO, check filesystem integrity: `(Buffer I\/O error on dev)(?:ice)? (\w+)`
- PCIe card no longer present: `pcieport (\w+:\w+:\w+\.\w+):\s+(\w+):\s+(Slot\(...`
- PCIe Link Down: `pcieport (\w+:\w+:\w+\.\w+):\s+(\w+):\s+(Slot\(...`
- Mismatched clock configuration between PCIe device and host: `pcieport (\w+:\w+:\w+\.\w+):\s+(\w+):\s+(curren...`
- RAS Correctable Error: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- RAS Uncorrectable Error: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- RAS Deferred Error: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- RAS Corrected PCIe Error: `((?:\[Hardware Error\]:\s+)?event severity: cor...`
- GPU Reset: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- GPU reset failed: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- ACA Error: `(Accelerator Check Architecture[^\n]*)(?:\n[^\n...`
- ACA Error: `(Accelerator Check Architecture[^\n]*)(?:\n[^\n...`
- MCE Error: `\[Hardware Error\]:.+MC\d+_STATUS.*(?:\n.*){0,5}`
- Mode 2 Reset Failed: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)? (...`
- RAS Corrected Error: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- SGX Error: `x86/cpu: SGX disabled by BIOS`
- MMP Error: `Failed to load MMP firmware qat_4xxx_mmp.bin`
- GPU Throttled: `amdgpu \w{4}:\w{2}:\w{2}.\w: amdgpu: WARN: GPU ...`
- RAS Poison Consumed: `amdgpu[ 0-9a-fA-F:.]+:(?:\s*amdgpu:)?\s+(?:{\d+...`
- RAS Poison created: `amdgpu[ 0-9a-fA-F:.]+:(?:\s*amdgpu:)?\s+(?:{\d+...`
- Bad page threshold exceeded: `(amdgpu: Saved bad pages (\d+) reaches threshol...`
- RAS Hardware Error: `Hardware error from APEI Generic Hardware Error...`
- Error Address: `Error Address.*(?:\s.*)`
- RAS EDR Event: `EDR: EDR event received`
- DPC Event: `DPC: .*`
- LNet: ko2iblnd has no matching interfaces: `(?:\[[^\]]+\]\s*)?LNetError:.*ko2iblnd:\s*No ma...`
- LNet: Error starting up LNI: `(?:\[[^\]]+\]\s*)?LNetError:\s*.*Error\s*-?\d+\...`
- Lustre: network initialisation failed: `LustreError:.*ptlrpc_init_portals\(\).*network ...` | **Collection Args:**
- `collect_rotated_logs`: bool — If True, also collect rotated dmesg log files from /var/log/dmesg*.
- `skip_sudo`: bool — If True, do not use sudo when running dmesg or listing log files.
- `log_dmesg_data`: bool — If True, log the collected dmesg output in artifacts. | [DmesgData](#DmesgData-Model) | [DmesgCollector](#Collector-Class-DmesgCollector) | [DmesgAnalyzer](#Data-Analyzer-Class-DmesgAnalyzer) | +| FabricsPlugin | lspci | grep -i cassini
lsmod | grep cxi
cxi_stat
ibstat
ibv_devinfo
ls -l /sys/class/infiniband/*/device/net
fi_info -p cxi
mst start
mst status -v
ip link show
ofed_info -s | - | - | [FabricsDataModel](#FabricsDataModel-Model) | [FabricsCollector](#Collector-Class-FabricsCollector) | - | +| JournalPlugin | journalctl --no-pager --system --output=short-iso
journalctl --no-pager --system --output=json | **Analyzer Args:**
- `analysis_range_start`: Optional[datetime.datetime] — Start of time range for analysis (ISO format). Only events on or after this time are analyzed.
- `analysis_range_end`: Optional[datetime.datetime] — End of time range for analysis (ISO format). Only events before this time are analyzed.
- `check_priority`: Optional[int] — Check against journal log priority (0=emergency..7=debug). If an entry has priority <= check_priority, an ERROR event...
- `group`: bool — If True, group entries that have the same priority and message. | **Collection Args:**
- `boot`: Optional[int] — Optional boot ID to limit journal collection to a specific boot. | [JournalData](#JournalData-Model) | [JournalCollector](#Collector-Class-JournalCollector) | [JournalAnalyzer](#Data-Analyzer-Class-JournalAnalyzer) | +| KernelPlugin | sh -c 'uname -a'
sh -c 'cat /proc/sys/kernel/numa_balancing'
wmic os get Version /Value | **Analyzer Args:**
- `exp_kernel`: Union[str, list] — Expected kernel version string(s) to match (e.g. from uname -a).
- `exp_numa`: Optional[int] — Expected value for kernel.numa_balancing (e.g. 0 or 1).
- `regex_match`: bool — If True, match exp_kernel as regex; otherwise exact match. | - | [KernelDataModel](#KernelDataModel-Model) | [KernelCollector](#Collector-Class-KernelCollector) | [KernelAnalyzer](#Data-Analyzer-Class-KernelAnalyzer) | +| KernelModulePlugin | cat /proc/modules
modinfo amdgpu
wmic os get Version /Value | **Analyzer Args:**
- `kernel_modules`: dict[str, dict] — Expected kernel module name -> {version, etc.}. Analyzer checks collected modules match.
- `regex_filter`: list[str] — List of regex patterns to filter which collected modules are checked (default: amd). | - | [KernelModuleDataModel](#KernelModuleDataModel-Model) | [KernelModuleCollector](#Collector-Class-KernelModuleCollector) | [KernelModuleAnalyzer](#Data-Analyzer-Class-KernelModuleAnalyzer) | +| MemoryPlugin | free -b
lsmem
numactl -H
wmic OS get FreePhysicalMemory /Value; wmic ComputerSystem get TotalPhysicalMemory /Value | **Analyzer Args:**
- `ratio`: float — Required free-memory ratio (0-1). Analysis fails if free/total < ratio.
- `memory_threshold`: str — Minimum free memory required (e.g. '30Gi', '1T'). Used when ratio is not sufficient. | - | [MemoryDataModel](#MemoryDataModel-Model) | [MemoryCollector](#Collector-Class-MemoryCollector) | [MemoryAnalyzer](#Data-Analyzer-Class-MemoryAnalyzer) | +| NetworkPlugin | ip addr show
curl
ethtool -S {interface}
ethtool {interface}
lldpcli show neighbor
lldpctl
ip neighbor show
ping
ip route show
ip rule show
wget | **Built-in Regexes:**
- tx_pfc_frames is non-zero: `^tx_pfc_frames$`
- tx_pfc_ena_frames_pri* is non-zero: `^tx_pfc_ena_frames_pri\d+$`
- pfc_pri*_tx_transitions is non-zero: `^pfc_pri\d+_tx_transitions$`
**Analyzer Args:**
- `error_regex`: Union[list[nodescraper.base.regexanalyzer.ErrorRegex], list[dict], NoneType] — Custom error regex patterns; each item can be ErrorRegex or dict with category/pattern. | **Collection Args:**
- `url`: Optional[str] — Optional URL to probe for network connectivity (used with netprobe).
- `netprobe`: Optional[Literal['ping', 'wget', 'curl']] — Tool to use for network connectivity probe: ping, wget, or curl. | [NetworkDataModel](#NetworkDataModel-Model) | [NetworkCollector](#Collector-Class-NetworkCollector) | [NetworkAnalyzer](#Data-Analyzer-Class-NetworkAnalyzer) | +| NicPlugin | niccli --listdev
niccli --list
niccli --list_devices
niccli -dev {device_num} nvm -getoption pcie_relaxed_ordering
niccli --dev {device_num} nvm --getoption pcie_relaxed_ordering
niccli -dev {device_num} nvm -getoption performance_profile
niccli --dev {device_num} nvm --getoption performance_profile
niccli -dev {device_num} nvm -getoption support_rdma -scope 0
niccli -dev {device_num} getqos
niccli --dev {device_num} nvm --getoption support_rdma
niccli --dev {device_num} qos --ets --show
niccli --version
nicctl show card
nicctl --version
nicctl show card flash partition --json
nicctl show card interrupts --json
nicctl show card logs --non-persistent
nicctl show card logs --boot-fault
nicctl show card logs --persistent
nicctl show card profile --json
nicctl show card time --json
nicctl show card statistics packet-buffer summary --json
nicctl show lif statistics --json
nicctl show lif internal queue-to-ud-pinning
nicctl show pipeline internal anomalies
nicctl show pipeline internal rsq-ring
nicctl show pipeline internal statistics memory
nicctl show port fsm
nicctl show port transceiver --json
nicctl show port statistics --json
nicctl show port internal mac
nicctl show qos headroom --json
nicctl show rdma queue --json
nicctl show rdma queue-pair --detail --json
nicctl show version firmware
nicctl show dcqcn
nicctl show environment
nicctl show lif
nicctl show pcie ats
nicctl show port
nicctl show qos
nicctl show rdma statistics
nicctl show version host-software
nicctl show dcqcn --card {card_id} --json
nicctl show card hardware-config --card {card_id} | **Analyzer Args:**
- `expected_values`: Optional[Dict[str, Dict[str, Any]]] — Per-command expected checks keyed by canonical key (see command_to_canonical_key).
- `performance_profile_expected`: str — Expected Broadcom performance_profile value (case-insensitive). Default RoCE.
- `support_rdma_disabled_values`: List[str] — Values that indicate RDMA is not supported (case-insensitive).
- `pcie_relaxed_ordering_expected`: str — Expected Broadcom pcie_relaxed_ordering value (e.g. 'Relaxed ordering = enabled'); checked case-insensitively. Defaul...
- `expected_qos_prio_map`: Optional[Dict[Any, Any]] — Expected priority-to-TC map (e.g. {0: 0, 1: 1}; keys may be int or str in config). Checked per device when set.
- `expected_qos_pfc_enabled`: Optional[int] — Expected PFC enabled value (0/1 or bitmask). Checked per device when set.
- `expected_qos_tsa_map`: Optional[Dict[Any, Any]] — Expected TSA map for ETS (e.g. {0: 'ets', 1: 'strict'}; keys may be int or str in config). Checked per device when set.
- `expected_qos_tc_bandwidth`: Optional[List[int]] — Expected TC bandwidth percentages. Checked per device when set.
- `require_qos_consistent_across_adapters`: bool — When True and no expected_qos_* are set, require all adapters to have the same prio_map, pfc_enabled, and tsa_map.
- `nicctl_log_error_regex`: Optional[List[Dict[str, Any]]] — Optional list of error patterns for nicctl show card logs. | **Collection Args:**
- `commands`: Optional[List[str]] — Optional list of niccli/nicctl commands to run. When None, default command set is used.
- `use_sudo_niccli`: bool — If True, run niccli commands with sudo when required.
- `use_sudo_nicctl`: bool — If True, run nicctl commands with sudo when required. | [NicDataModel](#NicDataModel-Model) | [NicCollector](#Collector-Class-NicCollector) | [NicAnalyzer](#Data-Analyzer-Class-NicAnalyzer) | | NvmePlugin | nvme smart-log {dev}
nvme error-log {dev} --log-entries=256
nvme id-ctrl {dev}
nvme id-ns {dev}{ns}
nvme fw-log {dev}
nvme self-test-log {dev}
nvme get-log {dev} --log-id=6 --log-len=512
nvme telemetry-log {dev} --output-file={dev}_{f_name}
nvme list -o json | - | - | [NvmeDataModel](#NvmeDataModel-Model) | [NvmeCollector](#Collector-Class-NvmeCollector) | - | -| OsPlugin | sh -c '( lsb_release -ds \|\| (cat /etc/*release \| grep PRETTY_NAME) \|\| uname -om ) 2>/dev/null \| head -n1'
cat /etc/*release \| grep VERSION_ID
wmic os get Version /value
wmic os get Caption /Value | **Analyzer Args:**
- `exp_os`: Union[str, list]
- `exact_match`: bool | - | [OsDataModel](#OsDataModel-Model) | [OsCollector](#Collector-Class-OsCollector) | [OsAnalyzer](#Data-Analyzer-Class-OsAnalyzer) | -| PackagePlugin | dnf list --installed
dpkg-query -W
pacman -Q
cat /etc/*release
wmic product get name,version | **Analyzer Args:**
- `exp_package_ver`: Dict[str, Optional[str]]
- `regex_match`: bool
- `rocm_regex`: Optional[str]
- `enable_rocm_regex`: bool | - | [PackageDataModel](#PackageDataModel-Model) | [PackageCollector](#Collector-Class-PackageCollector) | [PackageAnalyzer](#Data-Analyzer-Class-PackageAnalyzer) | -| PciePlugin | lspci -d {vendor_id}: -nn
lspci -x
lspci -xxxx
lspci -PP
lspci -PP -d {vendor_id}:{dev_id}
lspci -vvv
lspci -vvvt | **Analyzer Args:**
- `exp_speed`: int
- `exp_width`: int
- `exp_sriov_count`: int
- `exp_gpu_count_override`: Optional[int]
- `exp_max_payload_size`: Union[Dict[int, int], int, NoneType]
- `exp_max_rd_req_size`: Union[Dict[int, int], int, NoneType]
- `exp_ten_bit_tag_req_en`: Union[Dict[int, int], int, NoneType] | - | [PcieDataModel](#PcieDataModel-Model) | [PcieCollector](#Collector-Class-PcieCollector) | [PcieAnalyzer](#Data-Analyzer-Class-PcieAnalyzer) | -| ProcessPlugin | top -b -n 1
rocm-smi --showpids
top -b -n 1 -o %CPU | **Analyzer Args:**
- `max_kfd_processes`: int
- `max_cpu_usage`: float | **Collection Args:**
- `top_n_process`: int | [ProcessDataModel](#ProcessDataModel-Model) | [ProcessCollector](#Collector-Class-ProcessCollector) | [ProcessAnalyzer](#Data-Analyzer-Class-ProcessAnalyzer) | +| OsPlugin | sh -c '( lsb_release -ds || (cat /etc/*release | grep PRETTY_NAME) || uname -om ) 2>/dev/null | head -n1'
cat /etc/*release | grep VERSION_ID
wmic os get Version /value
wmic os get Caption /Value | **Analyzer Args:**
- `exp_os`: Union[str, list] — Expected OS name/version string(s) to match (e.g. from lsb_release or /etc/os-release).
- `exact_match`: bool — If True, require exact match for exp_os; otherwise substring match. | - | [OsDataModel](#OsDataModel-Model) | [OsCollector](#Collector-Class-OsCollector) | [OsAnalyzer](#Data-Analyzer-Class-OsAnalyzer) | +| PackagePlugin | dnf list --installed
dpkg-query -W
pacman -Q
cat /etc/*release
wmic product get name,version | **Analyzer Args:**
- `exp_package_ver`: Dict[str, Optional[str]] — Map package name -> expected version (None = any version). Checked against installed packages.
- `regex_match`: bool — If True, match package versions with regex; otherwise exact or prefix match.
- `rocm_regex`: Optional[str] — Optional regex to identify ROCm package version (used when enable_rocm_regex is True).
- `enable_rocm_regex`: bool — If True, use rocm_regex (or default pattern) to extract ROCm version for checks. | - | [PackageDataModel](#PackageDataModel-Model) | [PackageCollector](#Collector-Class-PackageCollector) | [PackageAnalyzer](#Data-Analyzer-Class-PackageAnalyzer) | +| PciePlugin | lspci -d {vendor_id}: -nn
lspci -x
lspci -xxxx
lspci -PP
lspci -PP -d {vendor_id}:{dev_id}
lspci -vvv
lspci -vvvt | **Analyzer Args:**
- `exp_speed`: int — Expected PCIe link speed (generation 1–5).
- `exp_width`: int — Expected PCIe link width in lanes (1–16).
- `exp_sriov_count`: int — Expected SR-IOV virtual function count.
- `exp_gpu_count_override`: Optional[int] — Override expected GPU count for validation.
- `exp_max_payload_size`: Union[Dict[int, int], int, NoneType] — Expected max payload size: int for all devices, or dict keyed by device ID.
- `exp_max_rd_req_size`: Union[Dict[int, int], int, NoneType] — Expected max read request size: int for all devices, or dict keyed by device ID.
- `exp_ten_bit_tag_req_en`: Union[Dict[int, int], int, NoneType] — Expected 10-bit tag request enable: int for all devices, or dict keyed by device ID. | - | [PcieDataModel](#PcieDataModel-Model) | [PcieCollector](#Collector-Class-PcieCollector) | [PcieAnalyzer](#Data-Analyzer-Class-PcieAnalyzer) | +| ProcessPlugin | top -b -n 1
rocm-smi --showpids
top -b -n 1 -o %CPU | **Analyzer Args:**
- `max_kfd_processes`: int — Maximum allowed number of KFD (Kernel Fusion Driver) processes; 0 disables the check.
- `max_cpu_usage`: float — Maximum allowed CPU usage (percent) for process checks. | **Collection Args:**
- `top_n_process`: int — Number of top processes by CPU usage to collect (e.g. for top -b -n 1 -o %CPU). | [ProcessDataModel](#ProcessDataModel-Model) | [ProcessCollector](#Collector-Class-ProcessCollector) | [ProcessAnalyzer](#Data-Analyzer-Class-ProcessAnalyzer) | | RdmaPlugin | rdma link -j
rdma dev
rdma link
rdma statistic -j | - | - | [RdmaDataModel](#RdmaDataModel-Model) | [RdmaCollector](#Collector-Class-RdmaCollector) | [RdmaAnalyzer](#Data-Analyzer-Class-RdmaAnalyzer) | -| RocmPlugin | {rocm_path}/opencl/bin/*/clinfo
env \| grep -Ei 'rocm\|hsa\|hip\|mpi\|openmp\|ucx\|miopen'
ls /sys/class/kfd/kfd/proc/
grep -i -E 'rocm' /etc/ld.so.conf.d/*
{rocm_path}/bin/rocminfo
ls -v -d {rocm_path}*
ls -v -d {rocm_path}-[3-7]* \| tail -1
ldconfig -p \| grep -i -E 'rocm'
grep . -r {rocm_path}/.info/* | **Analyzer Args:**
- `exp_rocm`: Union[str, list]
- `exp_rocm_latest`: str
- `exp_rocm_sub_versions`: dict[str, Union[str, list]] | **Collection Args:**
- `rocm_path`: str | [RocmDataModel](#RocmDataModel-Model) | [RocmCollector](#Collector-Class-RocmCollector) | [RocmAnalyzer](#Data-Analyzer-Class-RocmAnalyzer) | -| StoragePlugin | sh -c 'df -lH -B1 \| grep -v 'boot''
wmic LogicalDisk Where DriveType="3" Get DeviceId,Size,FreeSpace | - | **Collection Args:**
- `skip_sudo`: bool | [StorageDataModel](#StorageDataModel-Model) | [StorageCollector](#Collector-Class-StorageCollector) | [StorageAnalyzer](#Data-Analyzer-Class-StorageAnalyzer) | -| SysSettingsPlugin | cat /sys/{}
ls -1 /sys/{}
ls -l /sys/{} | **Analyzer Args:**
- `checks`: Optional[list[nodescraper.plugins.inband.sys_settings.analyzer_args.SysfsCheck]] | **Collection Args:**
- `paths`: list[str]
- `directory_paths`: list[str] | [SysSettingsDataModel](#SysSettingsDataModel-Model) | [SysSettingsCollector](#Collector-Class-SysSettingsCollector) | [SysSettingsAnalyzer](#Data-Analyzer-Class-SysSettingsAnalyzer) | -| SysctlPlugin | sysctl -n | **Analyzer Args:**
- `exp_vm_swappiness`: Optional[int]
- `exp_vm_numa_balancing`: Optional[int]
- `exp_vm_oom_kill_allocating_task`: Optional[int]
- `exp_vm_compaction_proactiveness`: Optional[int]
- `exp_vm_compact_unevictable_allowed`: Optional[int]
- `exp_vm_extfrag_threshold`: Optional[int]
- `exp_vm_zone_reclaim_mode`: Optional[int]
- `exp_vm_dirty_background_ratio`: Optional[int]
- `exp_vm_dirty_ratio`: Optional[int]
- `exp_vm_dirty_writeback_centisecs`: Optional[int]
- `exp_kernel_numa_balancing`: Optional[int] | - | [SysctlDataModel](#SysctlDataModel-Model) | [SysctlCollector](#Collector-Class-SysctlCollector) | [SysctlAnalyzer](#Data-Analyzer-Class-SysctlAnalyzer) | -| SyslogPlugin | ls -1 /var/log/syslog* 2>/dev/null \| grep -E '^/var/log/syslog(\.[0-9]+(\.gz)?)?$' \|\| true | - | - | [SyslogData](#SyslogData-Model) | [SyslogCollector](#Collector-Class-SyslogCollector) | - | +| RocmPlugin | {rocm_path}/opencl/bin/*/clinfo
env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen'
ls /sys/class/kfd/kfd/proc/
grep -i -E 'rocm' /etc/ld.so.conf.d/*
{rocm_path}/bin/rocminfo
ls -v -d {rocm_path}*
ls -v -d {rocm_path}-[3-7]* | tail -1
ldconfig -p | grep -i -E 'rocm'
grep . -r {rocm_path}/.info/* | **Analyzer Args:**
- `exp_rocm`: Union[str, list] — Expected ROCm version string(s) to match (e.g. from rocminfo).
- `exp_rocm_latest`: str — Expected 'latest' ROCm path or version string for versioned installs.
- `exp_rocm_sub_versions`: dict[str, Union[str, list]] — Map sub-version name (e.g. version_rocm) to expected string or list of allowed strings. | **Collection Args:**
- `rocm_path`: str — Base path to ROCm installation (e.g. /opt/rocm). Used for rocminfo, clinfo, and version discovery. | [RocmDataModel](#RocmDataModel-Model) | [RocmCollector](#Collector-Class-RocmCollector) | [RocmAnalyzer](#Data-Analyzer-Class-RocmAnalyzer) | +| StoragePlugin | sh -c 'df -lH -B1 | grep -v 'boot''
wmic LogicalDisk Where DriveType="3" Get DeviceId,Size,FreeSpace | - | **Collection Args:**
- `skip_sudo`: bool — If True, do not use sudo when running df and related storage commands. | [StorageDataModel](#StorageDataModel-Model) | [StorageCollector](#Collector-Class-StorageCollector) | [StorageAnalyzer](#Data-Analyzer-Class-StorageAnalyzer) | +| SysSettingsPlugin | cat /sys/{}
ls -1 /sys/{}
ls -l /sys/{} | **Analyzer Args:**
- `checks`: Optional[list[nodescraper.plugins.inband.sys_settings.analyzer_args.SysfsCheck]] — List of sysfs checks (path, expected values or pattern, display name). | **Collection Args:**
- `paths`: list[str] — Sysfs paths to read (cat). Paths with '*' are collected with ls -l (e.g. class/net/*/device).
- `directory_paths`: list[str] — Sysfs paths to list (ls -1); used for checks that match entry names by regex. | [SysSettingsDataModel](#SysSettingsDataModel-Model) | [SysSettingsCollector](#Collector-Class-SysSettingsCollector) | [SysSettingsAnalyzer](#Data-Analyzer-Class-SysSettingsAnalyzer) | +| SysctlPlugin | sysctl -n | **Analyzer Args:**
- `exp_vm_swappiness`: Optional[int] — Expected vm.swappiness value.
- `exp_vm_numa_balancing`: Optional[int] — Expected vm.numa_balancing value.
- `exp_vm_oom_kill_allocating_task`: Optional[int] — Expected vm.oom_kill_allocating_task value.
- `exp_vm_compaction_proactiveness`: Optional[int] — Expected vm.compaction_proactiveness value.
- `exp_vm_compact_unevictable_allowed`: Optional[int] — Expected vm.compact_unevictable_allowed value.
- `exp_vm_extfrag_threshold`: Optional[int] — Expected vm.extfrag_threshold value.
- `exp_vm_zone_reclaim_mode`: Optional[int] — Expected vm.zone_reclaim_mode value.
- `exp_vm_dirty_background_ratio`: Optional[int] — Expected vm.dirty_background_ratio value.
- `exp_vm_dirty_ratio`: Optional[int] — Expected vm.dirty_ratio value.
- `exp_vm_dirty_writeback_centisecs`: Optional[int] — Expected vm.dirty_writeback_centisecs value.
- `exp_kernel_numa_balancing`: Optional[int] — Expected kernel.numa_balancing value. | - | [SysctlDataModel](#SysctlDataModel-Model) | [SysctlCollector](#Collector-Class-SysctlCollector) | [SysctlAnalyzer](#Data-Analyzer-Class-SysctlAnalyzer) | +| SyslogPlugin | ls -1 /var/log/syslog* 2>/dev/null | grep -E '^/var/log/syslog(\.[0-9]+(\.gz)?)?$' || true | - | - | [SyslogData](#SyslogData-Model) | [SyslogCollector](#Collector-Class-SyslogCollector) | - | | UptimePlugin | uptime | - | - | [UptimeDataModel](#UptimeDataModel-Model) | [UptimeCollector](#Collector-Class-UptimeCollector) | - | # Collectors @@ -439,10 +439,135 @@ Collect raw output from niccli (Broadcom) and nicctl (Pensando) commands. **Link to code**: [nic_collector.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/nic/nic_collector.py) +### Class Variables + +- **CMD_NICCLI_VERSION**: `niccli --version` +- **CMD_NICCLI_LIST**: `niccli --list` +- **CMD_NICCLI_LIST_DEVICES**: `niccli --list_devices` +- **CMD_NICCLI_LIST_DEVICES_LEGACY**: `niccli --listdev` +- **CMD_NICCLI_DISCOVERY_LEGACY**: `['niccli --listdev', 'niccli --list']` +- **CMD_NICCLI_DISCOVERY_NEW**: `['niccli --list_devices', 'niccli --list']` +- **CMD_NICCLI_DISCOVERY**: `['niccli --listdev', 'niccli --list']` +- **CMD_NICCLI_DISCOVERY_ALL**: `frozenset({'niccli --list', 'niccli --list_devices', 'niccli --listdev'})` +- **CMD_NICCLI_SUPPORT_RDMA_TEMPLATE_LEGACY**: `niccli -dev {device_num} nvm -getoption support_rdma -scope 0` +- **CMD_NICCLI_PERFORMANCE_PROFILE_TEMPLATE_LEGACY**: `niccli -dev {device_num} nvm -getoption performance_profile` +- **CMD_NICCLI_PCIE_RELAXED_ORDERING_TEMPLATE_LEGACY**: `niccli -dev {device_num} nvm -getoption pcie_relaxed_ordering` +- **CMD_NICCLI_QOS_TEMPLATE_LEGACY**: `niccli -dev {device_num} getqos` +- **CMD_NICCLI_PER_DEVICE_LEGACY**: `[ + niccli -dev {device_num} nvm -getoption support_rdma -scope 0, + niccli -dev {device_num} nvm -getoption performance_profile, + niccli -dev 
{device_num} nvm -getoption pcie_relaxed_ordering, + niccli -dev {device_num} getqos +]` +- **CMD_NICCLI_SUPPORT_RDMA_TEMPLATE_NEW**: `niccli --dev {device_num} nvm --getoption support_rdma` +- **CMD_NICCLI_PERFORMANCE_PROFILE_TEMPLATE_NEW**: `niccli --dev {device_num} nvm --getoption performance_profile` +- **CMD_NICCLI_PCIE_RELAXED_ORDERING_TEMPLATE_NEW**: `niccli --dev {device_num} nvm --getoption pcie_relaxed_ordering` +- **CMD_NICCLI_QOS_TEMPLATE_NEW**: `niccli --dev {device_num} qos --ets --show` +- **CMD_NICCLI_PER_DEVICE_NEW**: `[ + niccli --dev {device_num} nvm --getoption support_rdma, + niccli --dev {device_num} nvm --getoption performance_profile, + niccli --dev {device_num} nvm --getoption pcie_relaxed_ordering, + niccli --dev {device_num} qos --ets --show +]` +- **CMD_NICCLI_SUPPORT_RDMA_TEMPLATE**: `niccli -dev {device_num} nvm -getoption support_rdma -scope 0` +- **CMD_NICCLI_PERFORMANCE_PROFILE_TEMPLATE**: `niccli -dev {device_num} nvm -getoption performance_profile` +- **CMD_NICCLI_PCIE_RELAXED_ORDERING_TEMPLATE**: `niccli -dev {device_num} nvm -getoption pcie_relaxed_ordering` +- **CMD_NICCLI_PER_DEVICE**: `[ + niccli -dev {device_num} nvm -getoption support_rdma -scope 0, + niccli -dev {device_num} nvm -getoption performance_profile, + niccli -dev {device_num} nvm -getoption pcie_relaxed_ordering, + niccli -dev {device_num} getqos +]` +- **CMD_NICCTL_CARD_TEXT**: `nicctl show card` +- **CMD_NICCTL_GLOBAL**: `[ + nicctl --version, + nicctl show card flash partition --json, + nicctl show card interrupts --json, + nicctl show card logs --non-persistent, + nicctl show card logs --boot-fault, + nicctl show card logs --persistent, + nicctl show card profile --json, + nicctl show card time --json, + nicctl show card statistics packet-buffer summary --json, + nicctl show lif statistics --json, + nicctl show lif internal queue-to-ud-pinning, + nicctl show pipeline internal anomalies, + nicctl show pipeline internal rsq-ring, + nicctl show pipeline 
internal statistics memory, + nicctl show port fsm, + nicctl show port transceiver --json, + nicctl show port statistics --json, + nicctl show port internal mac, + nicctl show qos headroom --json, + nicctl show rdma queue --json, + nicctl show rdma queue-pair --detail --json, + nicctl show version firmware +]` +- **CMD_NICCTL_PER_CARD**: `['nicctl show dcqcn --card {card_id} --json', 'nicctl show card hardware-config --card {card_id}']` +- **CMD_NICCTL_LEGACY_TEXT**: `[ + nicctl show card, + nicctl show dcqcn, + nicctl show environment, + nicctl show lif, + nicctl show pcie ats, + nicctl show port, + nicctl show qos, + nicctl show rdma statistics, + nicctl show version host-software +]` + ### Provides Data NicDataModel +### Commands + +- niccli --listdev +- niccli --list +- niccli --list_devices +- niccli -dev {device_num} nvm -getoption pcie_relaxed_ordering +- niccli --dev {device_num} nvm --getoption pcie_relaxed_ordering +- niccli -dev {device_num} nvm -getoption performance_profile +- niccli --dev {device_num} nvm --getoption performance_profile +- niccli -dev {device_num} nvm -getoption support_rdma -scope 0 +- niccli -dev {device_num} getqos +- niccli --dev {device_num} nvm --getoption support_rdma +- niccli --dev {device_num} qos --ets --show +- niccli --version +- nicctl show card +- nicctl --version +- nicctl show card flash partition --json +- nicctl show card interrupts --json +- nicctl show card logs --non-persistent +- nicctl show card logs --boot-fault +- nicctl show card logs --persistent +- nicctl show card profile --json +- nicctl show card time --json +- nicctl show card statistics packet-buffer summary --json +- nicctl show lif statistics --json +- nicctl show lif internal queue-to-ud-pinning +- nicctl show pipeline internal anomalies +- nicctl show pipeline internal rsq-ring +- nicctl show pipeline internal statistics memory +- nicctl show port fsm +- nicctl show port transceiver --json +- nicctl show port statistics --json +- nicctl show port 
internal mac +- nicctl show qos headroom --json +- nicctl show rdma queue --json +- nicctl show rdma queue-pair --detail --json +- nicctl show version firmware +- nicctl show dcqcn +- nicctl show environment +- nicctl show lif +- nicctl show pcie ats +- nicctl show port +- nicctl show qos +- nicctl show rdma statistics +- nicctl show version host-software +- nicctl show dcqcn --card {card_id} --json +- nicctl show card hardware-config --card {card_id} + ## Collector Class NvmeCollector ### Description @@ -1463,6 +1588,31 @@ Check memory usage is within the maximum allowed used memory **Link to code**: [memory_analyzer.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/memory/memory_analyzer.py) +## Data Analyzer Class NetworkAnalyzer + +### Description + +Check network statistics for errors (PFC and other network error counters). + +**Bases**: ['RegexAnalyzer'] + +**Link to code**: [network_analyzer.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/network/network_analyzer.py) + +### Class Variables + +- **ERROR_REGEX**: `[ + regex=re.compile('^tx_pfc_frames$') message='tx_pfc_frames is non-zero' event_category= event_priority=, + regex=re.compile('^tx_pfc_ena_frames_pri\\d+$') message='tx_pfc_ena_frames_pri* is non-zero' event_category= event_priority=, + regex=re.compile('^pfc_pri\\d+_tx_transitions$') message='pfc_pri*_tx_transitions is non-zero' event_category= event_priority= +]` + +### Regex Patterns + +- **Built-in Regexes:** +- - tx_pfc_frames is non-zero: `^tx_pfc_frames$` +- - tx_pfc_ena_frames_pri* is non-zero: `^tx_pfc_ena_frames_pri\d+$` +- - pfc_pri*_tx_transitions is non-zero: `^pfc_pri\d+_tx_transitions$` + ## Data Analyzer Class NicAnalyzer ### Description @@ -1687,23 +1837,23 @@ Check sysctl matches expected sysctl details ### Annotations / fields -- **check_static_data**: `bool` -- **expected_gpu_processes**: `Optional[int]` -- **expected_max_power**: `Optional[int]` -- 
**expected_driver_version**: `Optional[str]` -- **expected_memory_partition_mode**: `Optional[str]` -- **expected_compute_partition_mode**: `Optional[str]` -- **expected_pldm_version**: `Optional[str]` -- **l0_to_recovery_count_error_threshold**: `Optional[int]` -- **l0_to_recovery_count_warning_threshold**: `Optional[int]` -- **vendorid_ep**: `Optional[str]` -- **vendorid_ep_vf**: `Optional[str]` -- **devid_ep**: `Optional[str]` -- **devid_ep_vf**: `Optional[str]` -- **sku_name**: `Optional[str]` -- **expected_xgmi_speed**: `Optional[list[float]]` -- **analysis_range_start**: `Optional[datetime.datetime]` -- **analysis_range_end**: `Optional[datetime.datetime]` +- **check_static_data**: `bool` — If True, run static data checks (e.g. driver version, partition mode). +- **expected_gpu_processes**: `Optional[int]` — Expected number of GPU processes. +- **expected_max_power**: `Optional[int]` — Expected maximum power value (e.g. watts). +- **expected_driver_version**: `Optional[str]` — Expected AMD driver version string. +- **expected_memory_partition_mode**: `Optional[str]` — Expected memory partition mode (e.g. sp3, dp). +- **expected_compute_partition_mode**: `Optional[str]` — Expected compute partition mode. +- **expected_firmware_versions**: `Optional[dict[str, str]]` — Expected firmware versions keyed by amd-smi fw_id (e.g. PLDM_BUNDLE). +- **l0_to_recovery_count_error_threshold**: `Optional[int]` — L0-to-recovery count above which an error is raised. +- **l0_to_recovery_count_warning_threshold**: `Optional[int]` — L0-to-recovery count above which a warning is raised. +- **vendorid_ep**: `Optional[str]` — Expected endpoint vendor ID (e.g. for PCIe). +- **vendorid_ep_vf**: `Optional[str]` — Expected endpoint VF vendor ID. +- **devid_ep**: `Optional[str]` — Expected endpoint device ID. +- **devid_ep_vf**: `Optional[str]` — Expected endpoint VF device ID. +- **sku_name**: `Optional[str]` — Expected SKU name string for GPU. 
+- **expected_xgmi_speed**: `Optional[list[float]]` — Expected xGMI speed value(s) (e.g. link rate). +- **analysis_range_start**: `Optional[datetime.datetime]` — Start of time range for time-windowed analysis. +- **analysis_range_end**: `Optional[datetime.datetime]` — End of time range for time-windowed analysis. ## Analyzer Args Class BiosAnalyzerArgs @@ -1713,8 +1863,8 @@ Check sysctl matches expected sysctl details ### Annotations / fields -- **exp_bios_version**: `list[str]` -- **regex_match**: `bool` +- **exp_bios_version**: `list[str]` — Expected BIOS version(s) to match against collected value (str or list). +- **regex_match**: `bool` — If True, match exp_bios_version as regex; otherwise exact match. ## Analyzer Args Class CmdlineAnalyzerArgs @@ -1724,10 +1874,10 @@ Check sysctl matches expected sysctl details ### Annotations / fields -- **required_cmdline**: `Union[str, List]` -- **banned_cmdline**: `Union[str, List]` -- **os_overrides**: `Dict[str, nodescraper.plugins.inband.cmdline.cmdlineconfig.OverrideConfig]` -- **platform_overrides**: `Dict[str, nodescraper.plugins.inband.cmdline.cmdlineconfig.OverrideConfig]` +- **required_cmdline**: `Union[str, List]` — Command-line parameters that must be present (e.g. 'pci=bfsort'). +- **banned_cmdline**: `Union[str, List]` — Command-line parameters that must not be present. +- **os_overrides**: `Dict[str, nodescraper.plugins.inband.cmdline.cmdlineconfig.OverrideConfig]` — Per-OS overrides for required_cmdline and banned_cmdline (keyed by OS identifier). +- **platform_overrides**: `Dict[str, nodescraper.plugins.inband.cmdline.cmdlineconfig.OverrideConfig]` — Per-platform overrides for required_cmdline and banned_cmdline (keyed by platform). 
## Analyzer Args Class DeviceEnumerationAnalyzerArgs @@ -1737,9 +1887,9 @@ Check sysctl matches expected sysctl details ### Annotations / fields -- **cpu_count**: `Optional[list[int]]` -- **gpu_count**: `Optional[list[int]]` -- **vf_count**: `Optional[list[int]]` +- **cpu_count**: `Optional[list[int]]` — Expected CPU count(s); pass as int or list of ints. Analysis passes if actual is in list. +- **gpu_count**: `Optional[list[int]]` — Expected GPU count(s); pass as int or list of ints. Analysis passes if actual is in list. +- **vf_count**: `Optional[list[int]]` — Expected virtual function count(s); pass as int or list of ints. Analysis passes if actual is in list. ## Analyzer Args Class DkmsAnalyzerArgs @@ -1749,9 +1899,9 @@ Check sysctl matches expected sysctl details ### Annotations / fields -- **dkms_status**: `Union[str, list]` -- **dkms_version**: `Union[str, list]` -- **regex_match**: `bool` +- **dkms_status**: `Union[str, list]` — Expected dkms status string(s) to match (e.g. 'amd/1.0.0'). At least one of dkms_status or dkms_version required. +- **dkms_version**: `Union[str, list]` — Expected dkms version string(s) to match. At least one of dkms_status or dkms_version required. +- **regex_match**: `bool` — If True, match dkms_status and dkms_version as regex; otherwise exact match. ## Analyzer Args Class JournalAnalyzerArgs @@ -1765,8 +1915,10 @@ Arguments for journal analyzer ### Annotations / fields -- **check_priority**: `Optional[int]` -- **group**: `bool` +- **analysis_range_start**: `Optional[datetime.datetime]` — Start of time range for analysis (ISO format). Only events on or after this time are analyzed. +- **analysis_range_end**: `Optional[datetime.datetime]` — End of time range for analysis (ISO format). Only events before this time are analyzed. +- **check_priority**: `Optional[int]` — Check against journal log priority (0=emergency..7=debug). If an entry has priority <= check_priority, an ERROR event is raised. 
+- **group**: `bool` — If True, group entries that have the same priority and message. ## Analyzer Args Class KernelAnalyzerArgs @@ -1776,9 +1928,9 @@ Arguments for journal analyzer ### Annotations / fields -- **exp_kernel**: `Union[str, list]` -- **exp_numa**: `Optional[int]` -- **regex_match**: `bool` +- **exp_kernel**: `Union[str, list]` — Expected kernel version string(s) to match (e.g. from uname -a). +- **exp_numa**: `Optional[int]` — Expected value for kernel.numa_balancing (e.g. 0 or 1). +- **regex_match**: `bool` — If True, match exp_kernel as regex; otherwise exact match. ## Analyzer Args Class KernelModuleAnalyzerArgs @@ -1788,8 +1940,8 @@ Arguments for journal analyzer ### Annotations / fields -- **kernel_modules**: `dict[str, dict]` -- **regex_filter**: `list[str]` +- **kernel_modules**: `dict[str, dict]` — Expected kernel module name -> {version, etc.}. Analyzer checks collected modules match. +- **regex_filter**: `list[str]` — List of regex patterns to filter which collected modules are checked (default: amd). ## Analyzer Args Class MemoryAnalyzerArgs @@ -1799,8 +1951,22 @@ Arguments for journal analyzer ### Annotations / fields -- **ratio**: `float` -- **memory_threshold**: `str` +- **ratio**: `float` — Required free-memory ratio (0-1). Analysis fails if free/total < ratio. +- **memory_threshold**: `str` — Minimum free memory required (e.g. '30Gi', '1T'). Used when ratio is not sufficient. + +## Analyzer Args Class NetworkAnalyzerArgs + +### Description + +Arguments for the network analyzer plugin. + +**Bases**: ['AnalyzerArgs'] + +**Link to code**: [analyzer_args.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/network/analyzer_args.py) + +### Annotations / fields + +- **error_regex**: `Union[list[nodescraper.base.regexanalyzer.ErrorRegex], list[dict], NoneType]` — Custom error regex patterns; each item can be ErrorRegex or dict with category/pattern. 
## Analyzer Args Class NicAnalyzerArgs @@ -1814,16 +1980,16 @@ Analyzer args for niccli/nicctl data, with expected_values keyed by canonical co ### Annotations / fields -- **expected_values**: `Optional[Dict[str, Dict[str, Any]]]` -- **performance_profile_expected**: `str` -- **support_rdma_disabled_values**: `List[str]` -- **pcie_relaxed_ordering_expected**: `str` -- **expected_qos_prio_map**: `Optional[Dict[Any, Any]]` -- **expected_qos_pfc_enabled**: `Optional[int]` -- **expected_qos_tsa_map**: `Optional[Dict[Any, Any]]` -- **expected_qos_tc_bandwidth**: `Optional[List[int]]` -- **require_qos_consistent_across_adapters**: `bool` -- **nicctl_log_error_regex**: `Optional[List[Dict[str, Any]]]` +- **expected_values**: `Optional[Dict[str, Dict[str, Any]]]` — Per-command expected checks keyed by canonical key (see command_to_canonical_key). +- **performance_profile_expected**: `str` — Expected Broadcom performance_profile value (case-insensitive). Default RoCE. +- **support_rdma_disabled_values**: `List[str]` — Values that indicate RDMA is not supported (case-insensitive). +- **pcie_relaxed_ordering_expected**: `str` — Expected Broadcom pcie_relaxed_ordering value (e.g. 'Relaxed ordering = enabled'); checked case-insensitively. Default enabled. +- **expected_qos_prio_map**: `Optional[Dict[Any, Any]]` — Expected priority-to-TC map (e.g. {0: 0, 1: 1}; keys may be int or str in config). Checked per device when set. +- **expected_qos_pfc_enabled**: `Optional[int]` — Expected PFC enabled value (0/1 or bitmask). Checked per device when set. +- **expected_qos_tsa_map**: `Optional[Dict[Any, Any]]` — Expected TSA map for ETS (e.g. {0: 'ets', 1: 'strict'}; keys may be int or str in config). Checked per device when set. +- **expected_qos_tc_bandwidth**: `Optional[List[int]]` — Expected TC bandwidth percentages. Checked per device when set. 
+- **require_qos_consistent_across_adapters**: `bool` — When True and no expected_qos_* are set, require all adapters to have the same prio_map, pfc_enabled, and tsa_map. +- **nicctl_log_error_regex**: `Optional[List[Dict[str, Any]]]` — Optional list of error patterns for nicctl show card logs. ## Analyzer Args Class OsAnalyzerArgs @@ -1833,8 +1999,8 @@ Analyzer args for niccli/nicctl data, with expected_values keyed by canonical co ### Annotations / fields -- **exp_os**: `Union[str, list]` -- **exact_match**: `bool` +- **exp_os**: `Union[str, list]` — Expected OS name/version string(s) to match (e.g. from lsb_release or /etc/os-release). +- **exact_match**: `bool` — If True, require exact match for exp_os; otherwise substring match. ## Analyzer Args Class PackageAnalyzerArgs @@ -1844,10 +2010,10 @@ Analyzer args for niccli/nicctl data, with expected_values keyed by canonical co ### Annotations / fields -- **exp_package_ver**: `Dict[str, Optional[str]]` -- **regex_match**: `bool` -- **rocm_regex**: `Optional[str]` -- **enable_rocm_regex**: `bool` +- **exp_package_ver**: `Dict[str, Optional[str]]` — Map package name -> expected version (None = any version). Checked against installed packages. +- **regex_match**: `bool` — If True, match package versions with regex; otherwise exact or prefix match. +- **rocm_regex**: `Optional[str]` — Optional regex to identify ROCm package version (used when enable_rocm_regex is True). +- **enable_rocm_regex**: `bool` — If True, use rocm_regex (or default pattern) to extract ROCm version for checks. 
## Analyzer Args Class PcieAnalyzerArgs @@ -1861,13 +2027,13 @@ Arguments for PCIe analyzer ### Annotations / fields -- **exp_speed**: `int` -- **exp_width**: `int` -- **exp_sriov_count**: `int` -- **exp_gpu_count_override**: `Optional[int]` -- **exp_max_payload_size**: `Union[Dict[int, int], int, NoneType]` -- **exp_max_rd_req_size**: `Union[Dict[int, int], int, NoneType]` -- **exp_ten_bit_tag_req_en**: `Union[Dict[int, int], int, NoneType]` +- **exp_speed**: `int` — Expected PCIe link speed (generation 1–5). +- **exp_width**: `int` — Expected PCIe link width in lanes (1–16). +- **exp_sriov_count**: `int` — Expected SR-IOV virtual function count. +- **exp_gpu_count_override**: `Optional[int]` — Override expected GPU count for validation. +- **exp_max_payload_size**: `Union[Dict[int, int], int, NoneType]` — Expected max payload size: int for all devices, or dict keyed by device ID. +- **exp_max_rd_req_size**: `Union[Dict[int, int], int, NoneType]` — Expected max read request size: int for all devices, or dict keyed by device ID. +- **exp_ten_bit_tag_req_en**: `Union[Dict[int, int], int, NoneType]` — Expected 10-bit tag request enable: int for all devices, or dict keyed by device ID. ## Analyzer Args Class ProcessAnalyzerArgs @@ -1877,8 +2043,8 @@ Arguments for PCIe analyzer ### Annotations / fields -- **max_kfd_processes**: `int` -- **max_cpu_usage**: `float` +- **max_kfd_processes**: `int` — Maximum allowed number of KFD (Kernel Fusion Driver) processes; 0 disables the check. +- **max_cpu_usage**: `float` — Maximum allowed CPU usage (percent) for process checks. ## Analyzer Args Class RocmAnalyzerArgs @@ -1888,9 +2054,9 @@ Arguments for PCIe analyzer ### Annotations / fields -- **exp_rocm**: `Union[str, list]` -- **exp_rocm_latest**: `str` -- **exp_rocm_sub_versions**: `dict[str, Union[str, list]]` +- **exp_rocm**: `Union[str, list]` — Expected ROCm version string(s) to match (e.g. from rocminfo). 
+- **exp_rocm_latest**: `str` — Expected 'latest' ROCm path or version string for versioned installs. +- **exp_rocm_sub_versions**: `dict[str, Union[str, list]]` — Map sub-version name (e.g. version_rocm) to expected string or list of allowed strings. ## Analyzer Args Class SysSettingsAnalyzerArgs @@ -1907,7 +2073,7 @@ Sysfs settings for analysis via a list of checks (path, expected values, name). ### Annotations / fields -- **checks**: `Optional[list[nodescraper.plugins.inband.sys_settings.analyzer_args.SysfsCheck]]` +- **checks**: `Optional[list[nodescraper.plugins.inband.sys_settings.analyzer_args.SysfsCheck]]` — List of sysfs checks (path, expected values or pattern, display name). ## Analyzer Args Class SysctlAnalyzerArgs @@ -1917,14 +2083,14 @@ Sysfs settings for analysis via a list of checks (path, expected values, name). ### Annotations / fields -- **exp_vm_swappiness**: `Optional[int]` -- **exp_vm_numa_balancing**: `Optional[int]` -- **exp_vm_oom_kill_allocating_task**: `Optional[int]` -- **exp_vm_compaction_proactiveness**: `Optional[int]` -- **exp_vm_compact_unevictable_allowed**: `Optional[int]` -- **exp_vm_extfrag_threshold**: `Optional[int]` -- **exp_vm_zone_reclaim_mode**: `Optional[int]` -- **exp_vm_dirty_background_ratio**: `Optional[int]` -- **exp_vm_dirty_ratio**: `Optional[int]` -- **exp_vm_dirty_writeback_centisecs**: `Optional[int]` -- **exp_kernel_numa_balancing**: `Optional[int]` +- **exp_vm_swappiness**: `Optional[int]` — Expected vm.swappiness value. +- **exp_vm_numa_balancing**: `Optional[int]` — Expected vm.numa_balancing value. +- **exp_vm_oom_kill_allocating_task**: `Optional[int]` — Expected vm.oom_kill_allocating_task value. +- **exp_vm_compaction_proactiveness**: `Optional[int]` — Expected vm.compaction_proactiveness value. +- **exp_vm_compact_unevictable_allowed**: `Optional[int]` — Expected vm.compact_unevictable_allowed value. +- **exp_vm_extfrag_threshold**: `Optional[int]` — Expected vm.extfrag_threshold value. 
+- **exp_vm_zone_reclaim_mode**: `Optional[int]` — Expected vm.zone_reclaim_mode value. +- **exp_vm_dirty_background_ratio**: `Optional[int]` — Expected vm.dirty_background_ratio value. +- **exp_vm_dirty_ratio**: `Optional[int]` — Expected vm.dirty_ratio value. +- **exp_vm_dirty_writeback_centisecs**: `Optional[int]` — Expected vm.dirty_writeback_centisecs value. +- **exp_kernel_numa_balancing**: `Optional[int]` — Expected kernel.numa_balancing value. diff --git a/docs/generate_plugin_doc_bundle.py b/docs/generate_plugin_doc_bundle.py index 0c2c839b..b7676b6a 100644 --- a/docs/generate_plugin_doc_bundle.py +++ b/docs/generate_plugin_doc_bundle.py @@ -27,13 +27,17 @@ Usage python generate_plugin_doc_bundle.py \ --package /home/alexbara/node-scraper/nodescraper/plugins/inband \ - --output PLUGIN_DOC.md + --output PLUGIN_DOC.md \ + --update-readme-help + """ import argparse import importlib import inspect import os import pkgutil +import re +import subprocess import sys from pathlib import Path from typing import Any, Iterable, List, Optional, Type @@ -262,15 +266,32 @@ def extract_regexes_and_args_from_analyzer( elif isinstance(val, str): output.append(f"**{attr}**: `{val}`") - # Extract analyzer args if provided + # Extract analyzer args if provided (prefer model_fields for descriptions) if inspect.isclass(args_cls): - anns = get_attr(args_cls, "__annotations__", {}) or {} - if anns: + fields = get_attr(args_cls, "model_fields", None) + if fields and isinstance(fields, dict): output.append("**Analyzer Args:**") - for key, value in anns.items(): - # Format the type annotation - type_str = format_type_annotation(value) - output.append(f"- `{key}`: {type_str}") + for key in fields: + try: + finfo = fields[key] + ann = getattr(finfo, "annotation", None) + type_str = format_type_annotation(ann) if ann is not None else "Any" + line = f"- `{key}`: {type_str}" + desc = get_field_description( + finfo, for_table=True, model_cls=args_cls, field_name=key + ) + if desc: + line 
+= f" — {desc}" + output.append(line) + except Exception: + pass + else: + anns = get_attr(args_cls, "__annotations__", {}) or {} + if anns: + output.append("**Analyzer Args:**") + for key, value in anns.items(): + type_str = format_type_annotation(value) + output.append(f"- `{key}`: {type_str}") return output @@ -289,7 +310,13 @@ def extract_collection_args_from_collector_args(args_cls: Optional[type]) -> Lis finfo = fields[key] ann = getattr(finfo, "annotation", None) type_str = format_type_annotation(ann) if ann is not None else "Any" - output.append(f"- `{key}`: {type_str}") + line = f"- `{key}`: {type_str}" + desc = get_field_description( + finfo, for_table=True, model_cls=args_cls, field_name=key + ) + if desc: + line += f" — {desc}" + output.append(line) except Exception: pass if not output: @@ -302,6 +329,15 @@ def extract_collection_args_from_collector_args(args_cls: Optional[type]) -> Lis return output +def escape_table_cell(s: str) -> str: + """Escape content for a markdown table cell so pipes and newlines don't break columns. + Use HTML entity for pipe so all markdown parsers treat it as content, not column separator. + """ + if not s: + return s + return s.replace("|", "|").replace("\n", " ").replace("\r", " ") + + def md_header(text: str, level: int = 2) -> str: return f"{'#' * level} {text}\n\n" @@ -335,11 +371,55 @@ def format_type_annotation(type_ann: Any) -> str: return type_str +def get_field_description( + finfo: Any, + for_table: bool = False, + max_len: Optional[int] = 120, + model_cls: Optional[Type] = None, + field_name: Optional[str] = None, +) -> Optional[str]: + """Get description from a Pydantic FieldInfo. If for_table, single-line and escape pipes. + Falls back to model JSON schema description when model_cls and field_name are provided. 
+ """ + desc = getattr(finfo, "description", None) + if (not desc or not isinstance(desc, str)) and model_cls and field_name: + try: + schema = model_cls.model_json_schema() + desc = schema.get("properties", {}).get(field_name, {}).get("description") + except Exception: + pass + if not desc or not isinstance(desc, str): + return None + desc = desc.strip() + if not desc: + return None + if for_table: + desc = desc.replace("\n", " ").replace("|", "\\|") + if max_len and len(desc) > max_len: + desc = desc[: max_len - 3].rstrip() + "..." + return desc + + def annotations_for_model(model_cls: type) -> List[str]: anns = get_attr(model_cls, "__annotations__", {}) or {} return [f"**{k}**: `{format_type_annotation(v)}`" for k, v in anns.items()] +def format_class_var_value(val: Any) -> str: + """Stable string for docs. set/frozenset repr order depends on PYTHONHASHSEED.""" + if isinstance(val, frozenset): + if not val: + return "frozenset()" + items = sorted(val, key=lambda x: (type(x).__name__, repr(x))) + return "frozenset({" + ", ".join(repr(x) for x in items) + "})" + if isinstance(val, set): + if not val: + return "set()" + items = sorted(val, key=lambda x: (type(x).__name__, repr(x))) + return "{" + ", ".join(repr(x) for x in items) + "}" + return str(val) + + def class_vars_dump(cls: type, exclude: set) -> List[str]: ignore = {"abc_impl", "_abc_impl", "__abstractmethods__"} exclude = set(exclude) | ignore @@ -362,7 +442,7 @@ def class_vars_dump(cls: type, exclude: set) -> List[str]: else: out.append(f"**{name}**: `{val}`") else: - out.append(f"**{name}**: `{val}`") + out.append(f"**{name}**: `{format_class_var_value(val)}`") return out @@ -374,17 +454,9 @@ def generate_plugin_table_rows(plugins: List[type]) -> List[List[str]]: an = get_attr(p, "ANALYZER", None) args = get_attr(p, "ANALYZER_ARGS", None) collector_args_cls = get_attr(p, "COLLECTOR_ARGS", None) - cmds = [] + cmds: List[str] = [] if inspect.isclass(col): - cmds += extract_cmds_from_classvars(col) - seen 
= set() - uniq = [] - for c in cmds: - key = " ".join(c.split()) - if key not in seen: - seen.add(key) - uniq.append(c) - cmds = uniq + cmds = extract_cmds_from_classvars(col) # Extract regexes and args from analyzer regex_and_args = [] @@ -397,10 +469,10 @@ def generate_plugin_table_rows(plugins: List[type]) -> List[List[str]]: rows.append( [ p.__name__, - "
".join(cmds).replace("|", "\\|") if cmds else "-", - "
".join(regex_and_args).replace("|", "\\|") if regex_and_args else "-", + escape_table_cell("
".join(cmds)) if cmds else "-", + escape_table_cell("
".join(regex_and_args)) if regex_and_args else "-", ( - "
".join(collection_args_lines).replace("|", "\\|") + escape_table_cell("
".join(collection_args_lines)) if collection_args_lines else "-" ), @@ -440,16 +512,8 @@ def render_collector_section(col: type, link_base: str, rel_root: Optional[str]) dm = get_attr(col, "DATA_MODEL", None) s += md_header("Provides Data", 3) + (f"{dm.__name__}\n\n" if inspect.isclass(dm) else "-\n\n") - cmds = [] - cmds += extract_cmds_from_classvars(col) + cmds = extract_cmds_from_classvars(col) if cmds: - seen, uniq = set(), [] - for c in cmds: - key = " ".join(c.split()) - if key not in seen: - seen.add(key) - uniq.append(c) - cmds = uniq s += md_header("Commands", 3) + md_list(cmds) return s @@ -506,14 +570,80 @@ def render_analyzer_args_section(args_cls: type, link_base: str, rel_root: Optio _url = setup_link(args_cls, link_base, rel_root) s += md_kv("Link to code", f"[{Path(_url).name}]({_url})") - anns = get_attr(args_cls, "__annotations__", {}) or {} - if anns: - ann_items = [f"**{k}**: `{format_type_annotation(v)}`" for k, v in anns.items()] - s += md_header("Annotations / fields", 3) + md_list(ann_items) + fields = get_attr(args_cls, "model_fields", None) + if fields and isinstance(fields, dict): + ann_items = [] + for k in fields: + try: + finfo = fields[k] + ann = getattr(finfo, "annotation", None) + type_str = format_type_annotation(ann) if ann is not None else "Any" + item = f"**{k}**: `{type_str}`" + field_desc = get_field_description( + finfo, for_table=False, model_cls=args_cls, field_name=k + ) + if field_desc: + item += f" — {field_desc}" + ann_items.append(item) + except Exception: + ann = getattr(fields[k], "annotation", None) + ann_items.append( + f"**{k}**: `{format_type_annotation(ann) if ann is not None else 'Any'}`" + ) + if ann_items: + s += md_header("Annotations / fields", 3) + md_list(ann_items) + else: + anns = get_attr(args_cls, "__annotations__", {}) or {} + if anns: + ann_items = [f"**{k}**: `{format_type_annotation(v)}`" for k, v in anns.items()] + s += md_header("Annotations / fields", 3) + md_list(ann_items) return s +# 
Markers in README.md that bracket the node-scraper -h block (HTML comments, not rendered).
+README_HELP_BLOCK_START = "<!-- node-scraper-help-start -->"
+README_HELP_BLOCK_END = "<!-- node-scraper-help-end -->"
+
+
+def update_readme_help(readme_path: Path) -> bool:
+    """
+    Update the node-scraper -h output block in README.md.
+    The block must be wrapped with <!-- node-scraper-help-start --> and <!-- node-scraper-help-end -->.
+    """
+    result = subprocess.run(
+        [sys.executable, "-m", "nodescraper.cli.cli", "-h"],
+        capture_output=True,
+        text=True,
+        cwd=readme_path.parent,
+    )
+    if result.returncode != 0:
+        return False
+    help_text = result.stdout.strip()
+    # Redact hostname in --sys-name default so README does not show machine name
+    help_text = re.sub(
+        r"(--sys-name STRING\s+System name \(default: )\S+",
+        r"\g<1><redacted>)",
+        help_text,
+    )
+    content = readme_path.read_text(encoding="utf-8")
+    start_idx = content.find(README_HELP_BLOCK_START)
+    end_idx = content.find(README_HELP_BLOCK_END)
+    if start_idx == -1 or end_idx == -1 or end_idx <= start_idx:
+        return False
+    # Replace the entire bracketed block (from start marker through end marker)
+    block_end = end_idx + len(README_HELP_BLOCK_END)
+    new_block = f"{README_HELP_BLOCK_START}\n```sh\n{help_text}\n```\n{README_HELP_BLOCK_END}"
+    new_content = content[:start_idx] + new_block + content[block_end:]
+    readme_path.write_text(new_content, encoding="utf-8")
+    return True
+
+
 def main():
+    # Prefer loading plugins from repo root so Field descriptions are picked up
+    _repo_root = Path(__file__).resolve().parent.parent
+    if str(_repo_root) not in sys.path:
+        sys.path.insert(0, str(_repo_root))
+
     ap = argparse.ArgumentParser(
         description="Generate Plugin Table and detail sections with setup_link + rel-root."
) @@ -521,6 +651,16 @@ def main(): "--package", default=DEFAULT_ROOT_PACKAGE, help="Dotted package or filesystem path" ) ap.add_argument("--output", default="PLUGIN_DOC.md", help="Output Markdown file") + ap.add_argument( + "--update-readme-help", + action="store_true", + help="Update the node-scraper -h output block in README.md (run from repo root or with correct cwd)", + ) + ap.add_argument( + "--readme", + default=None, + help="Path to README.md (default: README.md in current working directory)", + ) args = ap.parse_args() root = args.package @@ -601,7 +741,26 @@ def all_subclasses(cls: Type) -> set[type]: for a in args_classes: out.append(render_analyzer_args_section(a, LINK_BASE_DEFAULT, REL_ROOT_DEFAULT)) - Path(args.output).write_text("".join(out), encoding="utf-8") + repo_root = Path(__file__).resolve().parent.parent + output_path = Path(args.output) + if not output_path.is_absolute(): + output_path = repo_root / output_path + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text("".join(out), encoding="utf-8") + + if args.update_readme_help: + readme_path = Path(args.readme) if args.readme else Path.cwd() / "README.md" + if not readme_path.is_file(): + readme_path = Path(__file__).resolve().parent.parent / "README.md" + if readme_path.is_file(): + if update_readme_help(readme_path): + print(f"Updated node-scraper -h block in {readme_path}") # noqa: T201 + else: + print(f"Could not find or update -h block in {readme_path}") # noqa: T201 + sys.exit(1) + else: + print(f"README not found: {readme_path}") # noqa: T201 + sys.exit(1) if __name__ == "__main__": diff --git a/nodescraper/base/regexanalyzer.py b/nodescraper/base/regexanalyzer.py index 4103c99d..a53267fa 100644 --- a/nodescraper/base/regexanalyzer.py +++ b/nodescraper/base/regexanalyzer.py @@ -35,6 +35,24 @@ from nodescraper.models.event import Event +def _coerce_event_priority_from_dict(value: Union[str, int, EventPriority]) -> EventPriority: + """Turn a string name, integer 
level, or already-coerced value into the canonical priority member. + + Args: + value: Member name (case-insensitive), numeric level, or same-type value passthrough. + + Returns: + Matching priority member for the configured level. + """ + if isinstance(value, EventPriority): + return value + if isinstance(value, int): + return EventPriority(value) + if isinstance(value, str): + return EventPriority[value.upper()] + raise TypeError(f"Invalid event_priority: {value!r}") + + class ErrorRegex(BaseModel): regex: re.Pattern message: str @@ -135,13 +153,13 @@ def _convert_and_extend_error_regex( if isinstance(item, ErrorRegex): converted_regex.append(item) elif isinstance(item, dict): - # Convert dict to ErrorRegex - item["regex"] = re.compile(item["regex"]) - if "event_category" in item: - item["event_category"] = EventCategory(item["event_category"]) - if "event_priority" in item: - item["event_priority"] = EventPriority(item["event_priority"]) - converted_regex.append(ErrorRegex(**item)) + d = dict(item) + d["regex"] = re.compile(d["regex"]) + if "event_category" in d: + d["event_category"] = EventCategory(d["event_category"]) + if "event_priority" in d: + d["event_priority"] = _coerce_event_priority_from_dict(d["event_priority"]) + converted_regex.append(ErrorRegex(**d)) return converted_regex + list(base_regex) diff --git a/nodescraper/cli/cli.py b/nodescraper/cli/cli.py index f4e2fe86..f76393bf 100644 --- a/nodescraper/cli/cli.py +++ b/nodescraper/cli/cli.py @@ -161,6 +161,14 @@ def build_parser( help="Change python log level", ) + parser.add_argument( + "--no-console-log", + action="store_true", + help="Write logs only to nodescraper.log under the run directory; do not print to stdout. " + "If no run log directory would be created (e.g. 
--log-path None), uses ./scraper_logs__/ " + "like the default layout.", + ) + parser.add_argument( "--gen-reference-config", dest="reference_config", @@ -316,26 +324,36 @@ def build_parser( parser_builder = DynamicParserBuilder(plugin_subparser, plugin_class) model_type_map = parser_builder.build_plugin_parser() except Exception as e: - print(f"Exception building arg parsers for {plugin_name}: {str(e)}") # noqa: T201 + logging.getLogger(DEFAULT_LOGGER).error( + "Exception building arg parsers for %s: %s", plugin_name, e, exc_info=True + ) continue plugin_subparser_map[plugin_name] = (plugin_subparser, model_type_map) return parser, plugin_subparser_map -def setup_logger(log_level: str = "INFO", log_path: Optional[str] = None) -> logging.Logger: +def setup_logger( + log_level: str = "INFO", + log_path: Optional[str] = None, + *, + console: bool = True, +) -> logging.Logger: """set up root logger when using the CLI Args: log_level (str): log level to use log_path (Optional[str]): optional path to filesystem log location + console (bool): if False, omit the stdout StreamHandler (file-only when log_path is set) Returns: logging.Logger: logger intstance """ - log_level = getattr(logging, log_level, "INFO") + log_level_no = getattr(logging, log_level, logging.INFO) - handlers = [logging.StreamHandler(stream=sys.stdout)] + handlers: list[logging.Handler] = [] + if console: + handlers.append(logging.StreamHandler(stream=sys.stdout)) if log_path: log_file_name = os.path.join(log_path, "nodescraper.log") @@ -343,15 +361,18 @@ def setup_logger(log_level: str = "INFO", log_path: Optional[str] = None) -> log logging.FileHandler(filename=log_file_name, mode="wt", encoding="utf-8"), ) + if not handlers: + handlers.append(logging.NullHandler()) + logging.basicConfig( force=True, - level=log_level, + level=log_level_no, format="%(asctime)25s %(levelname)10s %(name)25s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S %Z", handlers=handlers, encoding="utf-8", ) - 
logging.root.setLevel(logging.INFO) + logging.root.setLevel(log_level_no) logging.getLogger("paramiko").setLevel(logging.ERROR) logger = logging.getLogger(DEFAULT_LOGGER) @@ -391,11 +412,7 @@ def main(arg_input: Optional[list[str]] = None): sname = system_info.name.lower().replace("-", "_").replace(".", "_") timestamp = datetime.datetime.now().strftime("%Y_%m_%d-%I_%M_%S_%p") - if parsed_args.log_path and parsed_args.subcmd not in [ - "gen-plugin-config", - "describe", - "compare-runs", - ]: + if parsed_args.log_path: log_path = os.path.join( parsed_args.log_path, f"scraper_logs_{sname}_{timestamp}", @@ -404,7 +421,16 @@ def main(arg_input: Optional[list[str]] = None): else: log_path = None - logger = setup_logger(parsed_args.log_level, log_path) + if parsed_args.no_console_log and not log_path: + base_dir = parsed_args.log_path if parsed_args.log_path else "." + log_path = os.path.join(base_dir, f"scraper_logs_{sname}_{timestamp}") + os.makedirs(log_path, exist_ok=True) + + logger = setup_logger( + parsed_args.log_level, + log_path, + console=not parsed_args.no_console_log, + ) if log_path: logger.info("Log path: %s", log_path) @@ -416,7 +442,12 @@ def main(arg_input: Optional[list[str]] = None): ) if parsed_args.subcmd == "summary": - generate_summary(parsed_args.search_path, parsed_args.output_path, logger) + generate_summary( + parsed_args.search_path, + parsed_args.output_path, + logger, + artifact_dir=log_path, + ) sys.exit(0) if parsed_args.subcmd == "describe": @@ -431,6 +462,7 @@ def main(arg_input: Optional[list[str]] = None): skip_plugins=getattr(parsed_args, "skip_plugins", None) or [], include_plugins=getattr(parsed_args, "include_plugins", None), truncate_message=not getattr(parsed_args, "dont_truncate", False), + artifact_dir=log_path, ) sys.exit(0) @@ -463,7 +495,7 @@ def main(arg_input: Optional[list[str]] = None): "Could not read OEMDiagnosticDataType@Redfish.AllowableValues from LogService" ) sys.exit(1) - print(json.dumps(allowable, indent=2)) # 
noqa: T201 + logger.info("%s", json.dumps(allowable, indent=2)) finally: conn.close() sys.exit(0) @@ -474,10 +506,8 @@ def main(arg_input: Optional[list[str]] = None): ref_config = generate_reference_config_from_logs( parsed_args.reference_config_from_logs, plugin_reg, logger ) - output_path = os.getcwd() - if parsed_args.output_path: - output_path = parsed_args.output_path - path = os.path.join(output_path, "reference_config.json") + out_dir = log_path if log_path else parsed_args.output_path + path = os.path.join(out_dir, "reference_config.json") try: with open(path, "w") as f: json.dump( @@ -490,7 +520,9 @@ def main(arg_input: Optional[list[str]] = None): logger.error(exp) sys.exit(0) - parse_gen_plugin_config(parsed_args, plugin_reg, config_reg, logger) + parse_gen_plugin_config( + parsed_args, plugin_reg, config_reg, logger, artifact_dir=log_path + ) parsed_plugin_args = {} for plugin, plugin_args in plugin_arg_map.items(): diff --git a/nodescraper/cli/compare_runs.py b/nodescraper/cli/compare_runs.py index fe7b8f76..acbb92ca 100644 --- a/nodescraper/cli/compare_runs.py +++ b/nodescraper/cli/compare_runs.py @@ -25,6 +25,7 @@ ############################################################################### import json import logging +import os import re import sys from pathlib import Path @@ -359,6 +360,7 @@ def run_compare_runs( include_plugins: Optional[Sequence[str]] = None, output_path: Optional[str] = None, truncate_message: bool = True, + artifact_dir: Optional[str] = None, ) -> None: """Compare datamodels from two run log directories and log results. @@ -369,8 +371,10 @@ def run_compare_runs( logger: Logger for output. skip_plugins: Optional list of plugin names to exclude from comparison. include_plugins: Optional list of plugin names to include; if set, only these are compared. - output_path: Optional path for full diff report; default is __diff.txt. 
+ output_path: Optional path for full diff report; default is __diff.txt + in the current directory, or under artifact_dir when set. truncate_message: If True, truncate message text and show only first 3 errors; if False, show full text and all. + artifact_dir: When set and output_path is not, write the diff file inside this directory (e.g. CLI run log dir). """ p1 = Path(path1) p2 = Path(path2) @@ -482,11 +486,11 @@ def run_compare_runs( out_file = output_path if not out_file: - out_file = f"{Path(path1).name}_{Path(path2).name}_diff.txt" + basename = f"{Path(path1).name}_{Path(path2).name}_diff.txt" + out_file = os.path.join(artifact_dir, basename) if artifact_dir else basename full_report = _build_full_diff_report(path1, path2, data1, data2, all_plugins) Path(out_file).write_text(full_report, encoding="utf-8") logger.info("Full diff report written to: %s", out_file) table_summary = TableSummary(logger=logger) table_summary.collate_results(plugin_results=plugin_results, connection_results=[]) - print(f"Diff file written to {out_file}") # noqa: T201 diff --git a/nodescraper/cli/dynamicparserbuilder.py b/nodescraper/cli/dynamicparserbuilder.py index 0616a4ac..8c0c9c68 100644 --- a/nodescraper/cli/dynamicparserbuilder.py +++ b/nodescraper/cli/dynamicparserbuilder.py @@ -24,7 +24,15 @@ # ############################################################################### import argparse -from typing import Literal, Optional, Type, get_args, get_origin +from typing import ( + Annotated, + Literal, + Optional, + Type, + get_args, + get_origin, + get_type_hints, +) from pydantic import BaseModel @@ -35,6 +43,45 @@ from nodescraper.typeutils import TypeUtils +def _help_from_annotated(anno: object) -> str: + """Pull CLI help from ``Annotated[T, metadata...]`` (string or ``Field(description=...)``).""" + if anno is None or get_origin(anno) is not Annotated: + return "" + for meta in get_args(anno)[1:]: + if isinstance(meta, str): + return meta + desc = getattr(meta, 
"description", None) + if isinstance(desc, str) and desc.strip(): + return desc + return "" + + +def _get_run_arg_help(plugin_class: Type[PluginInterface], arg: str) -> str: + """Get help text for a run() parameter from typing.Annotated metadata on the parameter.""" + try: + run_obj = None + for cls in plugin_class.__mro__: + if "run" in cls.__dict__: + run_obj = cls.__dict__["run"] + break + if run_obj is None: + run_obj = plugin_class.run + run_fn = run_obj + if isinstance(run_obj, staticmethod): + run_fn = run_obj.__func__ + elif isinstance(run_obj, classmethod): + run_fn = run_obj.__func__ + raw = getattr(run_fn, "__annotations__", {}).get(arg) + text = _help_from_annotated(raw) + if text: + return text + hints = get_type_hints(run_fn, include_extras=True) + return _help_from_annotated(hints.get(arg)) + except Exception: + pass + return "" + + class DynamicParserBuilder: """Dynamically build an argparse parser based on function type annotations or pydantic model types""" @@ -69,7 +116,10 @@ def build_plugin_parser(self) -> dict: for model_arg in model_args: model_type_map[model_arg] = arg else: - self.add_argument(type_class_map, arg.replace("_", "-"), arg_data.required) + help_text = _get_run_arg_help(self.plugin_class, arg) + self.add_argument( + type_class_map, arg.replace("_", "-"), arg_data.required, help_text=help_text + ) return model_type_map @@ -118,6 +168,7 @@ def add_argument( arg_name: str, required: bool, annotation: Optional[Type] = None, + help_text: Optional[str] = None, ) -> None: """Add an argument to a parser with an appropriate type @@ -126,7 +177,9 @@ def add_argument( arg_name (str): argument name required (bool): whether or not the arg is required annotation (Optional[Type]): full type annotation for extracting Literal choices + help_text (Optional[str]): help text for the argument (shown in -h output). 
""" + add_kw = {} if help_text is None else {"help": help_text} # Check for Literal types and extract choices literal_choices = None if Literal in type_class_map and annotation: @@ -139,12 +192,23 @@ def add_argument( if list in type_class_map: type_class = type_class_map[list] + inner = type_class.inner_type + if inner is dict or get_origin(inner) is dict: + elt_type = dict_arg + metavar = META_VAR_MAP[dict] + elif inner is not None: + elt_type = inner + metavar = META_VAR_MAP.get(inner, "STRING") + else: + elt_type = str + metavar = "STRING" self.parser.add_argument( f"--{arg_name}", nargs="*", - type=type_class.inner_type if type_class.inner_type else str, + type=elt_type, required=required, - metavar=META_VAR_MAP.get(type_class.inner_type, "STRING"), + metavar=metavar, + **add_kw, ) elif bool in type_class_map: self.parser.add_argument( @@ -152,6 +216,7 @@ def add_argument( type=bool_arg, required=required, choices=[True, False], + **add_kw, ) elif Literal in type_class_map and literal_choices: # Add argument with choices for Literal types @@ -161,26 +226,47 @@ def add_argument( required=required, choices=literal_choices, metavar=f"{{{','.join(literal_choices)}}}", + **add_kw, ) elif float in type_class_map: self.parser.add_argument( - f"--{arg_name}", type=float, required=required, metavar=META_VAR_MAP[float] + f"--{arg_name}", + type=float, + required=required, + metavar=META_VAR_MAP[float], + **add_kw, ) elif int in type_class_map: self.parser.add_argument( - f"--{arg_name}", type=int, required=required, metavar=META_VAR_MAP[int] + f"--{arg_name}", + type=int, + required=required, + metavar=META_VAR_MAP[int], + **add_kw, ) elif str in type_class_map: self.parser.add_argument( - f"--{arg_name}", type=str, required=required, metavar=META_VAR_MAP[str] + f"--{arg_name}", + type=str, + required=required, + metavar=META_VAR_MAP[str], + **add_kw, ) elif dict in type_class_map or self.get_model_arg(type_class_map): self.parser.add_argument( - f"--{arg_name}", 
type=dict_arg, required=required, metavar=META_VAR_MAP[dict] + f"--{arg_name}", + type=dict_arg, + required=required, + metavar=META_VAR_MAP[dict], + **add_kw, ) else: self.parser.add_argument( - f"--{arg_name}", type=str, required=required, metavar=META_VAR_MAP[str] + f"--{arg_name}", + type=str, + required=required, + metavar=META_VAR_MAP[str], + **add_kw, ) def build_model_arg_parser(self, model: type[BaseModel], required: bool) -> list[str]: @@ -203,10 +289,21 @@ def build_model_arg_parser(self, model: type[BaseModel], required: bool) -> list if type(None) in type_class_map and len(attr_data.type_classes) == 1: continue - # Get the full annotation from the model field + # Get the full annotation and description from the model field field = model.model_fields.get(attr) annotation = field.annotation if field else None + help_text = None + if field is not None: + desc = getattr(field, "description", None) + if isinstance(desc, str) and desc.strip(): + help_text = desc.strip() - self.add_argument(type_class_map, attr.replace("_", "-"), required, annotation) + self.add_argument( + type_class_map, + attr.replace("_", "-"), + required, + annotation, + help_text=help_text, + ) return list(type_map.keys()) diff --git a/nodescraper/cli/helper.py b/nodescraper/cli/helper.py index 41e30ede..620c8f38 100644 --- a/nodescraper/cli/helper.py +++ b/nodescraper/cli/helper.py @@ -31,7 +31,7 @@ import os import sys from pathlib import Path -from typing import Optional, Tuple +from typing import Optional, Sequence, Tuple from pydantic import BaseModel @@ -187,6 +187,13 @@ def build_config( return config +def log_cli_text_block(logger: logging.Logger, lines: Sequence[str]) -> None: + """Emit user-facing multi-line text through logging (respects handlers / --no-console-log).""" + text = "\n".join(lines).rstrip("\n") + if text: + logger.info("%s", text) + + def parse_describe( parsed_args: argparse.Namespace, plugin_reg: PluginRegistry, @@ -202,15 +209,18 @@ def parse_describe( logger 
(logging.Logger): logger instance """ if not parsed_args.name: + out: list[str] = [] if parsed_args.type == "config": - print("Available built-in configs:") # noqa: T201 + out.append("Available built-in configs:") for name in config_reg.configs: - print(f" {name}") # noqa: T201 + out.append(f" {name}") elif parsed_args.type == "plugin": - print("Available plugins:") # noqa: T201 + out.append("Available plugins:") for name in plugin_reg.plugins: - print(f" {name}") # noqa: T201 - print(f"\nUsage: describe {parsed_args.type} ") # noqa: T201 + out.append(f" {name}") + out.append("") + out.append(f"Usage: describe {parsed_args.type} ") + log_cli_text_block(logger, out) sys.exit(0) if parsed_args.type == "config": @@ -218,19 +228,25 @@ def parse_describe( logger.error("No config found for name: %s", parsed_args.name) sys.exit(1) config_model = config_reg.configs[parsed_args.name] - print(f"Config Name: {parsed_args.name}") # noqa: T201 - print(f"Description: {getattr(config_model, 'desc', '')}") # noqa: T201 - print("Plugins:") # noqa: T201 + out = [ + f"Config Name: {parsed_args.name}", + f"Description: {getattr(config_model, 'desc', '')}", + "Plugins:", + ] for plugin in getattr(config_model, "plugins", []): - print(f"\t{plugin}") # noqa: T201 + out.append(f"\t{plugin}") + log_cli_text_block(logger, out) elif parsed_args.type == "plugin": if parsed_args.name not in plugin_reg.plugins: logger.error("No plugin found for name: %s", parsed_args.name) sys.exit(1) plugin_class = plugin_reg.plugins[parsed_args.name] - print(f"Plugin Name: {parsed_args.name}") # noqa: T201 - print(f"Description: {getattr(plugin_class, '__doc__', '')}") # noqa: T201 + out = [ + f"Plugin Name: {parsed_args.name}", + f"Description: {getattr(plugin_class, '__doc__', '')}", + ] + log_cli_text_block(logger, out) sys.exit(0) @@ -240,6 +256,7 @@ def parse_gen_plugin_config( plugin_reg: PluginRegistry, config_reg: ConfigRegistry, logger: logging.Logger, + artifact_dir: Optional[str] = None, ): 
"""parse 'gen_plugin_config' cmd line argument @@ -248,6 +265,7 @@ def parse_gen_plugin_config( plugin_reg (PluginRegistry): plugin registry instance config_reg (ConfigRegistry): config registry instance logger (logging.Logger): logger instance + artifact_dir (Optional[str]): if set, write the config under this directory (CLI run log dir) """ try: config = build_config( @@ -256,7 +274,8 @@ def parse_gen_plugin_config( config.name = parsed_args.config_name.split(".")[0] config.desc = "Auto generated config" - output_path = os.path.join(parsed_args.output_path, parsed_args.config_name) + out_dir = artifact_dir if artifact_dir else parsed_args.output_path + output_path = os.path.join(out_dir, parsed_args.config_name) with open(output_path, "w", encoding="utf-8") as out_file: out_file.write(config.model_dump_json(indent=2)) @@ -398,7 +417,8 @@ def process_args( else: cur_plugin = None for arg in plugin_args: - if not arg.startswith("-") and "," in arg: + # Only split on commas before a plugin context is set (e.g. "P1,P2"). 
+ if not arg.startswith("-") and "," in arg and cur_plugin is None: for potential_plugin in arg.split(","): potential_plugin = potential_plugin.strip() if potential_plugin in plugin_names: @@ -576,13 +596,19 @@ def dump_to_csv(all_rows: list, filename: str, fieldnames: list[str], logger: lo logger.info("Data written to csv file: %s", filename) -def generate_summary(search_path: str, output_path: Optional[str], logger: logging.Logger): +def generate_summary( + search_path: str, + output_path: Optional[str], + logger: logging.Logger, + artifact_dir: Optional[str] = None, +): """Concatenate csv files into 1 summary csv file Args: search_path (str): Path for previous runs - output_path (Optional[str]): Path for new summary csv file + output_path (Optional[str]): Directory for new summary.csv (ignored when artifact_dir is set) logger (logging.Logger): instance of logger + artifact_dir (Optional[str]): if set, write summary.csv under this directory (CLI run log dir) """ fieldnames = ["nodename", "plugin", "status", "timestamp", "message"] @@ -606,8 +632,6 @@ def generate_summary(search_path: str, output_path: Optional[str], logger: loggi logger.error("No data rows found in matched CSV files.") return - if not output_path: - output_path = os.getcwd() - - output_path = os.path.join(output_path, "summary.csv") - dump_to_csv(all_rows, output_path, fieldnames, logger) + base_dir = artifact_dir if artifact_dir else (output_path or os.getcwd()) + out_file = os.path.join(base_dir, "summary.csv") + dump_to_csv(all_rows, out_file, fieldnames, logger) diff --git a/nodescraper/connection/redfish/redfish_connection.py b/nodescraper/connection/redfish/redfish_connection.py index 8711ff4a..449b4edb 100644 --- a/nodescraper/connection/redfish/redfish_connection.py +++ b/nodescraper/connection/redfish/redfish_connection.py @@ -183,6 +183,18 @@ def run_get(self, path: Union[str, RedfishPath]) -> RedfishGetResult: status_code=None, ) + def copy(self) -> "RedfishConnection": + """Return a 
new connection with the same config and its own session (for concurrent use).""" + return RedfishConnection( + base_url=self.base_url, + username=self.username, + password=self.password, + timeout=self.timeout, + use_session_auth=self.use_session_auth, + verify_ssl=self.verify_ssl, + api_root=self.api_root, + ) + def get_service_root(self) -> dict[str, Any]: """GET service root (e.g. /redfish/v1/).""" return self.get(RedfishPath(self.api_root)) diff --git a/nodescraper/interfaces/dataplugin.py b/nodescraper/interfaces/dataplugin.py index 1f948529..ed632fb4 100644 --- a/nodescraper/interfaces/dataplugin.py +++ b/nodescraper/interfaces/dataplugin.py @@ -27,7 +27,9 @@ import logging import os from pathlib import Path -from typing import Any, Generic, Optional, Type, Union +from typing import Annotated, Any, Generic, Optional, Type, Union + +from pydantic import Field from nodescraper.enums import EventPriority, ExecutionStatus, SystemInteractionLevel from nodescraper.generictypes import TAnalyzeArg, TCollectArg, TDataModel @@ -297,14 +299,29 @@ def analyze( def run( self, - collection: bool = True, - analysis: bool = True, + collection: Annotated[ + bool, + "Run the collector (True) or skip it (False).", + ] = True, + analysis: Annotated[ + bool, + "Run the analyzer (True) or skip it (False).", + ] = True, max_event_priority_level: Union[EventPriority, str] = EventPriority.CRITICAL, - system_interaction_level: Union[ - SystemInteractionLevel, str + system_interaction_level: Annotated[ + Union[SystemInteractionLevel, str], + "System interaction level (e.g. PASSIVE, INTERACTIVE, DISRUPTIVE).", ] = SystemInteractionLevel.INTERACTIVE, preserve_connection: bool = False, - data: Optional[Union[str, dict, TDataModel]] = None, + data: Annotated[ + Optional[Union[str, dict, TDataModel]], + Field( + description=( + "Path to pre-collected data" + "; use with --collection False to run the analyzer only." 
+ ), + ), + ] = None, collection_args: Optional[Union[TCollectArg, dict]] = None, analysis_args: Optional[Union[TAnalyzeArg, dict]] = None, ) -> PluginResult: diff --git a/nodescraper/models/event.py b/nodescraper/models/event.py index df6a61e1..33cf2801 100644 --- a/nodescraper/models/event.py +++ b/nodescraper/models/event.py @@ -28,12 +28,28 @@ import re import uuid from enum import Enum -from typing import Optional, Union +from typing import Any, Optional, Union from pydantic import BaseModel, Field, field_serializer, field_validator from nodescraper.enums import EventPriority + +def _data_to_json_safe(obj: Any) -> Any: + """Recursively convert event data to JSON-serializable form (e.g. exceptions -> str).""" + if isinstance(obj, BaseException): + return str(obj) + if isinstance(obj, dict): + return {k: _data_to_json_safe(v) for k, v in obj.items()} + if isinstance(obj, (list, tuple)): + return [_data_to_json_safe(v) for v in obj] + if isinstance(obj, (str, int, float, bool, type(None))): + return obj + if isinstance(obj, (Enum, datetime.datetime, uuid.UUID)): + return str(obj) + return str(obj) + + LOG_LEVEL_MAP = { logging.INFO: EventPriority.INFO, logging.WARNING: EventPriority.WARNING, @@ -127,6 +143,11 @@ def serialize_priority(self, priority: EventPriority, _info) -> str: """ return priority.name + @field_serializer("data") + def serialize_data(self, data: dict, _info) -> dict: + """Ensure data is JSON-serializable (e.g. 
convert nested exceptions to str).""" + return _data_to_json_safe(data) + @field_validator("data") @classmethod def validate_data(cls, data: dict) -> dict: diff --git a/nodescraper/models/taskresult.py b/nodescraper/models/taskresult.py index 3a4a2952..5615e464 100644 --- a/nodescraper/models/taskresult.py +++ b/nodescraper/models/taskresult.py @@ -209,6 +209,18 @@ def log_result(self, log_path: str) -> None: with open(event_log, "w", encoding="utf-8") as log_file: json.dump(all_events, log_file, indent=2) + @staticmethod + def _event_occurrence_count(event: Event) -> int: + """Occurrences represented by one event (RegexAnalyzer groups repeats in data['count']).""" + raw = event.data.get("count") + if raw is None: + return 1 + try: + n = int(raw) + except (TypeError, ValueError): + return 1 + return max(1, n) + def _get_event_summary(self) -> str: """Get summary string for events @@ -219,12 +231,13 @@ def _get_event_summary(self) -> str: warning_msg_counts: dict[str, int] = {} for event in self.events: + n = self._event_occurrence_count(event) if event.priority == EventPriority.WARNING: warning_msg_counts[event.description] = ( - warning_msg_counts.get(event.description, 0) + 1 + warning_msg_counts.get(event.description, 0) + n ) elif event.priority >= EventPriority.ERROR: - error_msg_counts[event.description] = error_msg_counts.get(event.description, 0) + 1 + error_msg_counts[event.description] = error_msg_counts.get(event.description, 0) + n summary_parts = [] diff --git a/nodescraper/models/timerangeargs.py b/nodescraper/models/timerangeargs.py index 33b1400f..08789a7b 100644 --- a/nodescraper/models/timerangeargs.py +++ b/nodescraper/models/timerangeargs.py @@ -26,6 +26,8 @@ import datetime from typing import Optional +from pydantic import Field + from nodescraper.models import AnalyzerArgs @@ -34,5 +36,11 @@ class TimeRangeAnalysisArgs(AnalyzerArgs): Model for time range analysis arguments. 
""" - analysis_range_start: Optional[datetime.datetime] = None - analysis_range_end: Optional[datetime.datetime] = None + analysis_range_start: Optional[datetime.datetime] = Field( + default=None, + description="Start of time range for analysis (ISO format). Only events on or after this time are analyzed.", + ) + analysis_range_end: Optional[datetime.datetime] = Field( + default=None, + description="End of time range for analysis (ISO format). Only events before this time are analyzed.", + ) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py index ffe86cd7..9a9cea71 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py @@ -534,18 +534,14 @@ def _format_static_mismatch_payload( "per_gpu": per_gpu_list, } - def check_pldm_version( + def check_firmware_versions( self, amdsmi_fw_data: Optional[list[Fw]], - expected_pldm_version: Optional[str], - ): - """Check expected pldm version - - Args: - amdsmi_fw_data (Optional[list[Fw]]): data model - expected_pldm_version (Optional[str]): expected pldm version - """ - PLDM_STRING = "PLDM_BUNDLE" + expected_firmware_versions: dict[str, str], + ) -> None: + """Check that each GPU reports the expected version for each ``fw_id``.""" + if not expected_firmware_versions: + return if amdsmi_fw_data is None or len(amdsmi_fw_data) == 0: self._log_event( category=EventCategory.PLATFORM, @@ -554,30 +550,37 @@ def check_pldm_version( data={"amdsmi_fw_data": amdsmi_fw_data}, ) return - mismatched_gpus: list[int] = [] - pldm_missing_gpus: list[int] = [] + mismatches: list[dict[str, object]] = [] + missing: list[dict[str, object]] = [] for fw_data in amdsmi_fw_data: gpu = fw_data.gpu if isinstance(fw_data.fw_list, str): - pldm_missing_gpus.append(gpu) + for fw_id in expected_firmware_versions: + missing.append({"gpu": gpu, "fw_id": fw_id}) continue - for fw_info in fw_data.fw_list: - if PLDM_STRING == 
fw_info.fw_id and expected_pldm_version != fw_info.fw_version: - mismatched_gpus.append(gpu) - if PLDM_STRING == fw_info.fw_id: - break - else: - pldm_missing_gpus.append(gpu) + actual_by_id = {item.fw_id: item.fw_version for item in fw_data.fw_list} + for fw_id, expected_ver in expected_firmware_versions.items(): + if fw_id not in actual_by_id: + missing.append({"gpu": gpu, "fw_id": fw_id}) + elif actual_by_id[fw_id] != expected_ver: + mismatches.append( + { + "gpu": gpu, + "fw_id": fw_id, + "expected": expected_ver, + "actual": actual_by_id[fw_id], + } + ) - if mismatched_gpus or pldm_missing_gpus: + if mismatches or missing: self._log_event( category=EventCategory.FW, - description="PLDM Version Mismatch", + description="Firmware version mismatch", priority=EventPriority.ERROR, data={ - "mismatched_gpus": mismatched_gpus, - "pldm_missing_gpus": pldm_missing_gpus, - "expected_pldm_version": expected_pldm_version, + "expected_firmware_versions": expected_firmware_versions, + "mismatches": mismatches, + "missing": missing, }, ) @@ -661,14 +664,12 @@ def check_expected_xgmi_link_speed( if expected_xgmi_speed is None or len(expected_xgmi_speed) == 0: self._log_event( category=EventCategory.IO, - description="Expected XGMI speed not configured, skipping XGMI link speed check", - priority=EventPriority.WARNING, + description=("Expected XGMI link speed not set; skipping XGMI link speed analysis"), + priority=EventPriority.INFO, + console_log=True, ) return - expected_str = ", ".join(str(s) for s in expected_xgmi_speed) - mismatches: list[dict] = [] - for xgmi_data in xgmi_metric: link_metric = xgmi_data.link_metrics try: @@ -687,7 +688,7 @@ def check_expected_xgmi_link_speed( continue xgmi_float = float(link_metric.bit_rate.value) - except ValueError: + except (ValueError, TypeError): self._log_event( category=EventCategory.IO, description="XGMI link speed is not a valid number", @@ -701,31 +702,19 @@ def check_expected_xgmi_link_speed( ) continue - if xgmi_float not in 
expected_xgmi_speed: - mismatches.append( - { + expected_floats = [float(e) for e in expected_xgmi_speed] + if xgmi_float not in expected_floats: + self._log_event( + category=EventCategory.IO, + description="XGMI link speed is not as expected", + priority=EventPriority.ERROR, + data={ + "expected_xgmi_speed": expected_xgmi_speed, + "xgmi_bit_rate": xgmi_float, "gpu": xgmi_data.gpu, - "actual_gt_s": xgmi_float, - "expected_gt_s": expected_str, - } + }, ) - if mismatches: - details = "; ".join( - f"GPU {m['gpu']} {m['actual_gt_s']} GT/s (expected {m['expected_gt_s']})" - for m in mismatches - ) - self._log_event( - category=EventCategory.IO, - description=f"XGMI link speed is not as expected: {details}", - priority=EventPriority.ERROR, - data={ - "expected_gt_s": expected_str, - "mismatches": mismatches, - }, - console_log=True, - ) - def analyze_data( self, data: AmdSmiDataModel, args: Optional[AmdSmiAnalyzerArgs] = None ) -> TaskResult: @@ -793,8 +782,8 @@ def analyze_data( args.expected_compute_partition_mode, ) - if args.expected_pldm_version: - self.check_pldm_version(data.firmware, args.expected_pldm_version) + if args.expected_firmware_versions: + self.check_firmware_versions(data.firmware, args.expected_firmware_versions) if data.cper_data: self.analyzer_cpers( diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 860c0e0f..d4f22c46 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -475,7 +475,8 @@ def _get_amdsmi_data( return None try: - return AmdSmiDataModel( + fw_ids = args.analysis_firmware_ids if args and args.analysis_firmware_ids else None + base = AmdSmiDataModel( version=version, gpu_list=gpu_list, process=processes, @@ -489,7 +490,10 @@ def _get_amdsmi_data( xgmi_link=xgmi_link or [], cper_data=cper_data, cper_afids=cper_afids, + analysis_firmware_ids=fw_ids, + analysis_ref=None, ) + return 
base.model_copy(update={"analysis_ref": base.build_analysis_ref()}) except ValidationError as err: self.logger.warning("Validation err: %s", err) self._log_event( @@ -763,7 +767,9 @@ def get_firmware(self) -> Optional[list[Fw]]: normalized: list[FwListItem] = [] for e in fw_list_raw: if isinstance(e, dict): - fid = e.get("fw_name") + fid = e.get("fw_id") + if fid is None: + fid = e.get("fw_name") ver = e.get("fw_version") normalized.append( FwListItem( diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index 04ff545f..940047ba 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -927,6 +927,24 @@ class Topo(BaseModel): links: list[TopoLink] +class AmdSmiAnalysisRef(BaseModel): + """Collector-filled summary for reference config""" + + gpu_processes_max: Optional[int] = None + max_power_w: Optional[int] = None + amdgpu_drv_version: Optional[str] = None + mem_part_mode: Optional[str] = None + compute_part_mode: Optional[str] = None + firmware_versions: Optional[dict[str, str]] = None + pldm_version: Optional[str] = None + ep_vendor_id: Optional[str] = None + ep_subvendor_id: Optional[str] = None + ep_device_id: Optional[str] = None + ep_subsystem_id: Optional[str] = None + ep_market_name: Optional[str] = None + xgmi_rates: Optional[list[float]] = None + + class AmdSmiDataModel(DataModel): """Data model for amd-smi data. 
@@ -957,6 +975,13 @@ class AmdSmiDataModel(DataModel): cper_data: Optional[list[FileModel]] = Field(default_factory=list) cper_afids: dict[str, int] = Field(default_factory=dict) + analysis_firmware_ids: Optional[list[str]] = Field( + default=None, + description="fw_id values used when snapshotting firmware_versions into analysis_ref.", + ) + + analysis_ref: Optional[AmdSmiAnalysisRef] = None + def get_list(self, gpu: int) -> Optional[AmdSmiListItem]: """Get the gpu list item for the given gpu id.""" if self.gpu_list is None: @@ -1001,3 +1026,154 @@ def get_bad_pages(self, gpu: int) -> Optional[BadPages]: if item.gpu == gpu: return item return None + + def _sorted_static_gpus(self) -> list[AmdSmiStatic]: + return sorted(self.static or [], key=lambda s: s.gpu) + + @property + def ref_gpu_processes_max(self) -> Optional[int]: + """Max process-list length across GPUs (for analysis reference snapshot).""" + proc = self.process + if not proc: + return None + counts: list[int] = [] + for p in proc: + if not p.process_list: + continue + if isinstance(p.process_list[0].process_info, str): + continue + counts.append(len(p.process_list)) + return max(counts) if counts else None + + @property + def ref_max_power_w(self) -> Optional[int]: + """First available max power limit (W) from static data, lowest GPU index first.""" + for gpu in self._sorted_static_gpus(): + lim = gpu.limit + if lim is None or lim.max_power is None or lim.max_power.value is None: + continue + try: + return int(float(lim.max_power.value)) + except (TypeError, ValueError): + continue + return None + + @property + def ref_amdgpu_drv_version(self) -> Optional[str]: + """Driver version from the lowest-index GPU with static data.""" + for gpu in self._sorted_static_gpus(): + if gpu.driver and gpu.driver.version: + return gpu.driver.version + return None + + @property + def ref_mem_part_mode(self) -> Optional[str]: + if self.partition is None: + return None + mps = self.partition.memory_partition + if not mps: 
+ return None + return sorted(mps, key=lambda p: p.gpu_id)[0].partition_type + + @property + def ref_compute_part_mode(self) -> Optional[str]: + if self.partition is None: + return None + cps = self.partition.compute_partition + if not cps: + return None + return sorted(cps, key=lambda p: p.gpu_id)[0].partition_type + + @property + def ref_firmware_versions(self) -> Optional[dict[str, str]]: + ids = ( + list(self.analysis_firmware_ids) + if self.analysis_firmware_ids is not None + else list(_DEFAULT_ANALYSIS_FW_IDS) + ) + return _first_observed_fw_versions(self.firmware, ids) or None + + @property + def ref_pldm_version(self) -> Optional[str]: + fw = self.ref_firmware_versions + return fw.get("PLDM_BUNDLE") if fw else None + + @property + def ref_ep_vendor_id(self) -> Optional[str]: + ss = self._sorted_static_gpus() + return ss[0].asic.vendor_id if ss else None + + @property + def ref_ep_subvendor_id(self) -> Optional[str]: + ss = self._sorted_static_gpus() + return ss[0].asic.subvendor_id if ss else None + + @property + def ref_ep_device_id(self) -> Optional[str]: + ss = self._sorted_static_gpus() + return ss[0].asic.device_id if ss else None + + @property + def ref_ep_subsystem_id(self) -> Optional[str]: + ss = self._sorted_static_gpus() + return ss[0].asic.subsystem_id if ss else None + + @property + def ref_ep_market_name(self) -> Optional[str]: + ss = self._sorted_static_gpus() + return ss[0].asic.market_name if ss else None + + @property + def ref_xgmi_rates(self) -> Optional[list[float]]: + xm = self.xgmi_metric + if not xm: + return None + rates: set[float] = set() + for m in xm: + br = m.link_metrics.bit_rate + if br is None or br.value is None: + continue + try: + rates.add(float(br.value)) + except (TypeError, ValueError): + continue + return sorted(rates) if rates else None + + def build_analysis_ref(self) -> AmdSmiAnalysisRef: + """Build ``AmdSmiAnalysisRef`` from current field values""" + return AmdSmiAnalysisRef( + 
gpu_processes_max=self.ref_gpu_processes_max, + max_power_w=self.ref_max_power_w, + amdgpu_drv_version=self.ref_amdgpu_drv_version, + mem_part_mode=self.ref_mem_part_mode, + compute_part_mode=self.ref_compute_part_mode, + firmware_versions=self.ref_firmware_versions, + pldm_version=self.ref_pldm_version, + ep_vendor_id=self.ref_ep_vendor_id, + ep_subvendor_id=self.ref_ep_subvendor_id, + ep_device_id=self.ref_ep_device_id, + ep_subsystem_id=self.ref_ep_subsystem_id, + ep_market_name=self.ref_ep_market_name, + xgmi_rates=self.ref_xgmi_rates, + ) + + +_DEFAULT_ANALYSIS_FW_IDS: tuple[str, ...] = ("PLDM_BUNDLE",) + + +def _first_observed_fw_versions(firmware: Optional[list[Fw]], fw_ids: list[str]) -> dict[str, str]: + """For each ``fw_id``, take the version from the lowest GPU index that reports it.""" + out: dict[str, str] = {} + if not firmware or not fw_ids: + return out + need = set(fw_ids) + for fw in sorted(firmware, key=lambda f: f.gpu): + if isinstance(fw.fw_list, str): + continue + for item in fw.fw_list: + if item.fw_id in need and item.fw_id not in out: + out[item.fw_id] = item.fw_version + need.discard(item.fw_id) + if not need: + break + return out diff --git a/nodescraper/plugins/inband/amdsmi/analyzer_args.py b/nodescraper/plugins/inband/amdsmi/analyzer_args.py index 333f37ae..3a5d2cfb 100644 --- a/nodescraper/plugins/inband/amdsmi/analyzer_args.py +++ b/nodescraper/plugins/inband/amdsmi/analyzer_args.py @@ -26,25 +26,92 @@ from datetime import datetime from typing import Optional +from pydantic import Field + from nodescraper.models import AnalyzerArgs +from nodescraper.plugins.inband.amdsmi.amdsmidata import AmdSmiDataModel class AmdSmiAnalyzerArgs(AnalyzerArgs): + check_static_data: bool = Field( + default=False, + description="If True, run static data checks (e.g. driver version, partition mode).", + ) + expected_gpu_processes: Optional[int] = Field( + default=None, description="Expected number of GPU processes." 
+ ) + expected_max_power: Optional[int] = Field( + default=None, description="Expected maximum power value (e.g. watts)." + ) + expected_driver_version: Optional[str] = Field( + default=None, description="Expected AMD driver version string." + ) + expected_memory_partition_mode: Optional[str] = Field( + default=None, description="Expected memory partition mode (e.g. sp3, dp)." + ) + expected_compute_partition_mode: Optional[str] = Field( + default=None, description="Expected compute partition mode." + ) + expected_firmware_versions: Optional[dict[str, str]] = Field( + default=None, + description="Expected firmware versions keyed by amd-smi fw_id (e.g. PLDM_BUNDLE).", + ) + l0_to_recovery_count_error_threshold: Optional[int] = Field( + default=3, + description="L0-to-recovery count above which an error is raised.", + ) + l0_to_recovery_count_warning_threshold: Optional[int] = Field( + default=1, + description="L0-to-recovery count above which a warning is raised.", + ) + vendorid_ep: Optional[str] = Field( + default=None, description="Expected endpoint vendor ID (e.g. for PCIe)." + ) + vendorid_ep_vf: Optional[str] = Field( + default=None, description="Expected endpoint VF vendor ID." + ) + devid_ep: Optional[str] = Field(default=None, description="Expected endpoint device ID.") + devid_ep_vf: Optional[str] = Field(default=None, description="Expected endpoint VF device ID.") + sku_name: Optional[str] = Field(default=None, description="Expected SKU name string for GPU.") + expected_xgmi_speed: Optional[list[float]] = Field( + default=None, description="Expected xGMI speed value(s) (e.g. link rate)." + ) + analysis_range_start: Optional[datetime] = Field( + default=None, description="Start of time range for time-windowed analysis." + ) + analysis_range_end: Optional[datetime] = Field( + default=None, description="End of time range for time-windowed analysis." 
+ ) + + @classmethod + def build_from_model(cls, datamodel: AmdSmiDataModel) -> "AmdSmiAnalyzerArgs": + """Build analyzer args from data model (reference snapshot set by collector). + + Args: + datamodel (AmdSmiDataModel): data model for plugin - check_static_data: bool = False - expected_gpu_processes: Optional[int] = None - expected_max_power: Optional[int] = None - expected_driver_version: Optional[str] = None - expected_memory_partition_mode: Optional[str] = None - expected_compute_partition_mode: Optional[str] = None - expected_pldm_version: Optional[str] = None - l0_to_recovery_count_error_threshold: Optional[int] = 3 - l0_to_recovery_count_warning_threshold: Optional[int] = 1 - vendorid_ep: Optional[str] = None - vendorid_ep_vf: Optional[str] = None - devid_ep: Optional[str] = None - devid_ep_vf: Optional[str] = None - sku_name: Optional[str] = None - expected_xgmi_speed: Optional[list[float]] = None - analysis_range_start: Optional[datetime] = None - analysis_range_end: Optional[datetime] = None + Returns: + AmdSmiAnalyzerArgs: instance of analyzer args class + """ + r = datamodel.analysis_ref + if r is None: + return cls() + fw_expect: dict[str, str] = {} + if r.firmware_versions: + fw_expect.update(r.firmware_versions) + if r.pldm_version is not None and "PLDM_BUNDLE" not in fw_expect: + fw_expect["PLDM_BUNDLE"] = r.pldm_version + return cls( + expected_gpu_processes=r.gpu_processes_max, + expected_max_power=r.max_power_w, + expected_driver_version=r.amdgpu_drv_version, + expected_memory_partition_mode=r.mem_part_mode, + expected_compute_partition_mode=r.compute_part_mode, + expected_firmware_versions=dict(fw_expect) if fw_expect else None, + vendorid_ep=r.ep_vendor_id, + vendorid_ep_vf=r.ep_subvendor_id, + devid_ep=r.ep_device_id, + devid_ep_vf=r.ep_subsystem_id, + sku_name=r.ep_market_name, + expected_xgmi_speed=r.xgmi_rates, + ) diff --git a/nodescraper/plugins/inband/amdsmi/collector_args.py b/nodescraper/plugins/inband/amdsmi/collector_args.py index 
97b5f904..4fedc39b 100644 --- a/nodescraper/plugins/inband/amdsmi/collector_args.py +++ b/nodescraper/plugins/inband/amdsmi/collector_args.py @@ -25,10 +25,19 @@ ############################################################################### from typing import Optional +from pydantic import Field + from nodescraper.models import CollectorArgs class AmdSmiCollectorArgs(CollectorArgs): """Collector arguments for AmdSmiPlugin""" - cper_file_path: Optional[str] = None + analysis_firmware_ids: Optional[list[str]] = Field( + default=None, + description=("amd-smi fw_id values to record in analysis_ref.firmware_versions "), + ) + cper_file_path: Optional[str] = Field( + default=None, + description="Path to CPER folder or file for RAS AFID collection (ras --afid --cper-file).", + ) diff --git a/nodescraper/plugins/inband/bios/analyzer_args.py b/nodescraper/plugins/inband/bios/analyzer_args.py index 2775b74a..6b8f2213 100644 --- a/nodescraper/plugins/inband/bios/analyzer_args.py +++ b/nodescraper/plugins/inband/bios/analyzer_args.py @@ -32,8 +32,14 @@ class BiosAnalyzerArgs(AnalyzerArgs): - exp_bios_version: list[str] = Field(default_factory=list) - regex_match: bool = False + exp_bios_version: list[str] = Field( + default_factory=list, + description="Expected BIOS version(s) to match against collected value (str or list).", + ) + regex_match: bool = Field( + default=False, + description="If True, match exp_bios_version as regex; otherwise exact match.", + ) @field_validator("exp_bios_version", mode="before") @classmethod diff --git a/nodescraper/plugins/inband/cmdline/analyzer_args.py b/nodescraper/plugins/inband/cmdline/analyzer_args.py index e4a34422..4c50ac70 100644 --- a/nodescraper/plugins/inband/cmdline/analyzer_args.py +++ b/nodescraper/plugins/inband/cmdline/analyzer_args.py @@ -40,10 +40,22 @@ class CmdlineAnalyzerArgs(AnalyzerArgs): - required_cmdline: Union[str, List] = Field(default_factory=list) - banned_cmdline: Union[str, List] = Field(default_factory=list) - 
os_overrides: Dict[str, OverrideConfig] = Field(default_factory=dict) - platform_overrides: Dict[str, OverrideConfig] = Field(default_factory=dict) + required_cmdline: Union[str, List] = Field( + default_factory=list, + description="Command-line parameters that must be present (e.g. 'pci=bfsort').", + ) + banned_cmdline: Union[str, List] = Field( + default_factory=list, + description="Command-line parameters that must not be present.", + ) + os_overrides: Dict[str, OverrideConfig] = Field( + default_factory=dict, + description="Per-OS overrides for required_cmdline and banned_cmdline (keyed by OS identifier).", + ) + platform_overrides: Dict[str, OverrideConfig] = Field( + default_factory=dict, + description="Per-platform overrides for required_cmdline and banned_cmdline (keyed by platform).", + ) @field_validator("required_cmdline", mode="before") @classmethod diff --git a/nodescraper/plugins/inband/device_enumeration/analyzer_args.py b/nodescraper/plugins/inband/device_enumeration/analyzer_args.py index 8f74ed00..5dc8b38c 100644 --- a/nodescraper/plugins/inband/device_enumeration/analyzer_args.py +++ b/nodescraper/plugins/inband/device_enumeration/analyzer_args.py @@ -25,7 +25,7 @@ ############################################################################### from typing import Any, Optional -from pydantic import field_validator +from pydantic import Field, field_validator from nodescraper.models import AnalyzerArgs @@ -33,9 +33,18 @@ class DeviceEnumerationAnalyzerArgs(AnalyzerArgs): - cpu_count: Optional[list[int]] = None - gpu_count: Optional[list[int]] = None - vf_count: Optional[list[int]] = None + cpu_count: Optional[list[int]] = Field( + default=None, + description="Expected CPU count(s); pass as int or list of ints. Analysis passes if actual is in list.", + ) + gpu_count: Optional[list[int]] = Field( + default=None, + description="Expected GPU count(s); pass as int or list of ints. 
Analysis passes if actual is in list.", + ) + vf_count: Optional[list[int]] = Field( + default=None, + description="Expected virtual function count(s); pass as int or list of ints. Analysis passes if actual is in list.", + ) @field_validator("cpu_count", "gpu_count", "vf_count", mode="before") @classmethod diff --git a/nodescraper/plugins/inband/dimm/collector_args.py b/nodescraper/plugins/inband/dimm/collector_args.py index 6e1f6897..432c9cd3 100644 --- a/nodescraper/plugins/inband/dimm/collector_args.py +++ b/nodescraper/plugins/inband/dimm/collector_args.py @@ -24,8 +24,13 @@ # ############################################################################### +from pydantic import Field + from nodescraper.models import CollectorArgs class DimmCollectorArgs(CollectorArgs): - skip_sudo: bool = False + skip_sudo: bool = Field( + default=False, + description="If True, do not use sudo when running dmidecode or wmic for memory info.", + ) diff --git a/nodescraper/plugins/inband/dkms/analyzer_args.py b/nodescraper/plugins/inband/dkms/analyzer_args.py index d6a3a8db..9acaeac2 100644 --- a/nodescraper/plugins/inband/dkms/analyzer_args.py +++ b/nodescraper/plugins/inband/dkms/analyzer_args.py @@ -32,9 +32,18 @@ class DkmsAnalyzerArgs(AnalyzerArgs): - dkms_status: Union[str, list] = Field(default_factory=list) - dkms_version: Union[str, list] = Field(default_factory=list) - regex_match: bool = False + dkms_status: Union[str, list] = Field( + default_factory=list, + description="Expected dkms status string(s) to match (e.g. 'amd/1.0.0'). At least one of dkms_status or dkms_version required.", + ) + dkms_version: Union[str, list] = Field( + default_factory=list, + description="Expected dkms version string(s) to match. 
At least one of dkms_status or dkms_version required.", + ) + regex_match: bool = Field( + default=False, + description="If True, match dkms_status and dkms_version as regex; otherwise exact match.", + ) def model_post_init(self, __context: Any) -> None: if not self.dkms_status and not self.dkms_version: diff --git a/nodescraper/plugins/inband/dmesg/analyzer_args.py b/nodescraper/plugins/inband/dmesg/analyzer_args.py index f8783761..cd9ba765 100644 --- a/nodescraper/plugins/inband/dmesg/analyzer_args.py +++ b/nodescraper/plugins/inband/dmesg/analyzer_args.py @@ -25,13 +25,30 @@ ############################################################################### from typing import Optional, Union +from pydantic import Field + from nodescraper.base.regexanalyzer import ErrorRegex from nodescraper.models import TimeRangeAnalysisArgs class DmesgAnalyzerArgs(TimeRangeAnalysisArgs): - check_unknown_dmesg_errors: Optional[bool] = True - exclude_category: Optional[set[str]] = None - interval_to_collapse_event: int = 60 - num_timestamps: int = 3 - error_regex: Optional[Union[list[ErrorRegex], list[dict]]] = None + check_unknown_dmesg_errors: Optional[bool] = Field( + default=True, + description="If True, treat unknown/unmatched dmesg error lines as failures.", + ) + exclude_category: Optional[set[str]] = Field( + default=None, + description="Set of error categories to exclude from analysis.", + ) + interval_to_collapse_event: int = Field( + default=60, + description="Seconds within which repeated events are collapsed into one (for rate limiting).", + ) + num_timestamps: int = Field( + default=3, + description="Number of timestamps to include per event in output.", + ) + error_regex: Optional[Union[list[ErrorRegex], list[dict]]] = Field( + default=None, + description="Custom error regex patterns; each item can be ErrorRegex or dict with category/pattern.", + ) diff --git a/nodescraper/plugins/inband/dmesg/collector_args.py b/nodescraper/plugins/inband/dmesg/collector_args.py 
index 22d85f17..24517eaf 100644 --- a/nodescraper/plugins/inband/dmesg/collector_args.py +++ b/nodescraper/plugins/inband/dmesg/collector_args.py @@ -23,6 +23,7 @@ # SOFTWARE. # ############################################################################### +from pydantic import Field from nodescraper.models import CollectorArgs @@ -34,6 +35,15 @@ class DmesgCollectorArgs(CollectorArgs): CollectorArgs (CollectorArgs): specific dmesg collector args """ - collect_rotated_logs: bool = False - skip_sudo: bool = False - log_dmesg_data: bool = True + collect_rotated_logs: bool = Field( + default=False, + description="If True, also collect rotated dmesg log files from /var/log/dmesg*.", + ) + skip_sudo: bool = Field( + default=False, + description="If True, do not use sudo when running dmesg or listing log files.", + ) + log_dmesg_data: bool = Field( + default=True, + description="If True, log the collected dmesg output in artifacts.", + ) diff --git a/nodescraper/plugins/inband/journal/analyzer_args.py b/nodescraper/plugins/inband/journal/analyzer_args.py index 858b1237..ee5390c1 100644 --- a/nodescraper/plugins/inband/journal/analyzer_args.py +++ b/nodescraper/plugins/inband/journal/analyzer_args.py @@ -25,17 +25,19 @@ ############################################################################### from typing import Optional +from pydantic import Field + from nodescraper.models import TimeRangeAnalysisArgs class JournalAnalyzerArgs(TimeRangeAnalysisArgs): """Arguments for journal analyzer""" - check_priority: Optional[int] = None - """Check against journal log priority levels. - emergency(0), alert(1), critical(2), error(3), warning(4), notice(5), info(6), debug(7). 
- If a journal log entry has a priority level less than or equal to check_priority, - an ERROR event will be raised.""" - - group: bool = True - """Groups entries if they have the same priority and the same message""" + check_priority: Optional[int] = Field( + default=None, + description="Check against journal log priority (0=emergency..7=debug). If an entry has priority <= check_priority, an ERROR event is raised.", + ) + group: bool = Field( + default=True, + description="If True, group entries that have the same priority and message.", + ) diff --git a/nodescraper/plugins/inband/journal/collector_args.py b/nodescraper/plugins/inband/journal/collector_args.py index 583c94ab..6832ffc7 100644 --- a/nodescraper/plugins/inband/journal/collector_args.py +++ b/nodescraper/plugins/inband/journal/collector_args.py @@ -26,8 +26,13 @@ from typing import Optional +from pydantic import Field + from nodescraper.models import CollectorArgs class JournalCollectorArgs(CollectorArgs): - boot: Optional[int] = None + boot: Optional[int] = Field( + default=None, + description="Optional boot ID to limit journal collection to a specific boot.", + ) diff --git a/nodescraper/plugins/inband/kernel/analyzer_args.py b/nodescraper/plugins/inband/kernel/analyzer_args.py index d7be40b6..29a2ec1e 100644 --- a/nodescraper/plugins/inband/kernel/analyzer_args.py +++ b/nodescraper/plugins/inband/kernel/analyzer_args.py @@ -32,9 +32,18 @@ class KernelAnalyzerArgs(AnalyzerArgs): - exp_kernel: Union[str, list] = Field(default_factory=list) - exp_numa: Optional[int] = None - regex_match: bool = False + exp_kernel: Union[str, list] = Field( + default_factory=list, + description="Expected kernel version string(s) to match (e.g. from uname -a).", + ) + exp_numa: Optional[int] = Field( + default=None, + description="Expected value for kernel.numa_balancing (e.g. 
0 or 1).", + ) + regex_match: bool = Field( + default=False, + description="If True, match exp_kernel as regex; otherwise exact match.", + ) @field_validator("exp_kernel", mode="before") @classmethod diff --git a/nodescraper/plugins/inband/kernel_module/analyzer_args.py b/nodescraper/plugins/inband/kernel_module/analyzer_args.py index fcf4fb4e..1aa8c88f 100644 --- a/nodescraper/plugins/inband/kernel_module/analyzer_args.py +++ b/nodescraper/plugins/inband/kernel_module/analyzer_args.py @@ -25,6 +25,8 @@ ############################################################################### import re +from pydantic import Field + from nodescraper.models import AnalyzerArgs from nodescraper.plugins.inband.kernel_module.kernel_module_data import ( KernelModuleDataModel, @@ -32,8 +34,14 @@ class KernelModuleAnalyzerArgs(AnalyzerArgs): - kernel_modules: dict[str, dict] = {} - regex_filter: list[str] = ["amd"] + kernel_modules: dict[str, dict] = Field( + default_factory=dict, + description="Expected kernel module name -> {version, etc.}. Analyzer checks collected modules match.", + ) + regex_filter: list[str] = Field( + default_factory=lambda: ["amd"], + description="List of regex patterns to filter which collected modules are checked (default: amd).", + ) @classmethod def build_from_model(cls, datamodel: KernelModuleDataModel) -> "KernelModuleAnalyzerArgs": diff --git a/nodescraper/plugins/inband/memory/analyzer_args.py b/nodescraper/plugins/inband/memory/analyzer_args.py index 968641ca..88dcd5d2 100644 --- a/nodescraper/plugins/inband/memory/analyzer_args.py +++ b/nodescraper/plugins/inband/memory/analyzer_args.py @@ -23,14 +23,22 @@ # SOFTWARE. 
# ############################################################################### +from pydantic import Field + from nodescraper.models.analyzerargs import AnalyzerArgs from .memorydata import MemoryDataModel class MemoryAnalyzerArgs(AnalyzerArgs): - ratio: float = 0.66 - memory_threshold: str = "30Gi" + ratio: float = Field( + default=0.66, + description="Required free-memory ratio (0-1). Analysis fails if free/total < ratio.", + ) + memory_threshold: str = Field( + default="30Gi", + description="Minimum free memory required (e.g. '30Gi', '1T'). Used when ratio is not sufficient.", + ) @classmethod def build_from_model(cls, datamodel: MemoryDataModel) -> "MemoryAnalyzerArgs": diff --git a/nodescraper/plugins/inband/network/analyzer_args.py b/nodescraper/plugins/inband/network/analyzer_args.py new file mode 100644 index 00000000..f2e63047 --- /dev/null +++ b/nodescraper/plugins/inband/network/analyzer_args.py @@ -0,0 +1,40 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from typing import Optional, Union + +from pydantic import Field + +from nodescraper.base.regexanalyzer import ErrorRegex +from nodescraper.models import AnalyzerArgs + + +class NetworkAnalyzerArgs(AnalyzerArgs): + """Arguments for the network analyzer plugin.""" + + error_regex: Optional[Union[list[ErrorRegex], list[dict]]] = Field( + default=None, + description="Custom error regex patterns; each item can be ErrorRegex or dict with category/pattern.", + ) diff --git a/nodescraper/plugins/inband/network/collector_args.py b/nodescraper/plugins/inband/network/collector_args.py index 70e84cf4..61ca7f23 100644 --- a/nodescraper/plugins/inband/network/collector_args.py +++ b/nodescraper/plugins/inband/network/collector_args.py @@ -26,9 +26,17 @@ from typing import Literal, Optional +from pydantic import Field + from nodescraper.models import CollectorArgs class NetworkCollectorArgs(CollectorArgs): - url: Optional[str] = None - netprobe: Optional[Literal["ping", "wget", "curl"]] = None + url: Optional[str] = Field( + default=None, + description="Optional URL to probe for network connectivity (used with netprobe).", + ) + netprobe: Optional[Literal["ping", "wget", "curl"]] = Field( + default=None, + description="Tool to use for network connectivity probe: ping, wget, or curl.", + ) diff --git a/nodescraper/plugins/inband/network/network_analyzer.py b/nodescraper/plugins/inband/network/network_analyzer.py new file mode 100644 index 00000000..dbd39fc8 --- /dev/null +++ b/nodescraper/plugins/inband/network/network_analyzer.py @@ -0,0 +1,123 @@ +############################################################################### +# 
+# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +import re +from typing import Optional + +from nodescraper.base.regexanalyzer import ErrorRegex, RegexAnalyzer +from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus +from nodescraper.models import TaskResult + +from .analyzer_args import NetworkAnalyzerArgs +from .networkdata import NetworkDataModel + + +class NetworkAnalyzer(RegexAnalyzer[NetworkDataModel, NetworkAnalyzerArgs]): + """Check network statistics for errors (PFC and other network error counters).""" + + DATA_MODEL = NetworkDataModel + + # Regex patterns for error fields checked from network statistics + ERROR_REGEX: list[ErrorRegex] = [ + ErrorRegex( + regex=re.compile(r"^tx_pfc_frames$"), + message="tx_pfc_frames is non-zero", + event_category=EventCategory.NETWORK, + ), + ErrorRegex( + regex=re.compile(r"^tx_pfc_ena_frames_pri\d+$"), + message="tx_pfc_ena_frames_pri* is non-zero", + event_category=EventCategory.NETWORK, + ), + ErrorRegex( + regex=re.compile(r"^pfc_pri\d+_tx_transitions$"), + message="pfc_pri*_tx_transitions is non-zero", + event_category=EventCategory.NETWORK, + ), + ] + + def analyze_data( + self, data: NetworkDataModel, args: Optional[NetworkAnalyzerArgs] = None + ) -> TaskResult: + """Analyze network statistics for non-zero error counters. + Currently only checks ethtool -S statistics. + + Args: + data: Network data model with ethtool_info containing interface statistics. + args: Optional analyzer arguments with custom error regex support. + + Returns: + TaskResult with status OK if no errors, ERROR if any error counter > 0. 
+ """ + if not data.ethtool_info: + self.result.message = "No network devices found" + self.result.status = ExecutionStatus.WARNING + return self.result + + if not args: + args = NetworkAnalyzerArgs() + + final_error_regex = self._convert_and_extend_error_regex(args.error_regex, self.ERROR_REGEX) + + error_state = False + for interface_name, ethtool_info in data.ethtool_info.items(): + errors_on_interface = [] # (error_field, value) + # Loop through all statistics in the ethtool statistics dict + for stat_name, stat_value in ethtool_info.statistics.items(): + # Check if this statistic matches any error field pattern + for error_regex_obj in final_error_regex: + if error_regex_obj.regex.match(stat_name): + # Try to convert string value to int + try: + value = int(stat_value) + except (ValueError, TypeError): + break # Skip non-numeric values + + if value > 0: + errors_on_interface.append((stat_name, value)) + break # Stop checking patterns once we find a match + + if errors_on_interface: + error_state = True + error_names = [e[0] for e in errors_on_interface] + errors_data = {field: value for field, value in errors_on_interface} + self._log_event( + category=EventCategory.NETWORK, + description=f"Network error detected on {interface_name}: [{', '.join(error_names)}]", + data={ + "interface": interface_name, + "errors": errors_data, + }, + priority=EventPriority.ERROR, + console_log=True, + ) + + if error_state: + self.result.message = "Network errors detected in statistics" + self.result.status = ExecutionStatus.ERROR + else: + self.result.message = "No network errors detected in statistics" + self.result.status = ExecutionStatus.OK + return self.result diff --git a/nodescraper/plugins/inband/network/network_plugin.py b/nodescraper/plugins/inband/network/network_plugin.py index 0ba55e79..7e1fe518 100644 --- a/nodescraper/plugins/inband/network/network_plugin.py +++ b/nodescraper/plugins/inband/network/network_plugin.py @@ -25,12 +25,14 @@ 
############################################################################### from nodescraper.base import InBandDataPlugin +from .analyzer_args import NetworkAnalyzerArgs from .collector_args import NetworkCollectorArgs +from .network_analyzer import NetworkAnalyzer from .network_collector import NetworkCollector from .networkdata import NetworkDataModel -class NetworkPlugin(InBandDataPlugin[NetworkDataModel, NetworkCollectorArgs, None]): +class NetworkPlugin(InBandDataPlugin[NetworkDataModel, NetworkCollectorArgs, NetworkAnalyzerArgs]): """Plugin for collection of network configuration data""" DATA_MODEL = NetworkDataModel @@ -38,3 +40,7 @@ class NetworkPlugin(InBandDataPlugin[NetworkDataModel, NetworkCollectorArgs, Non COLLECTOR = NetworkCollector COLLECTOR_ARGS = NetworkCollectorArgs + + ANALYZER = NetworkAnalyzer + + ANALYZER_ARGS = NetworkAnalyzerArgs diff --git a/nodescraper/plugins/inband/nic/collector_args.py b/nodescraper/plugins/inband/nic/collector_args.py index 8085b632..99b4c7fa 100644 --- a/nodescraper/plugins/inband/nic/collector_args.py +++ b/nodescraper/plugins/inband/nic/collector_args.py @@ -25,12 +25,23 @@ ############################################################################### from typing import List, Optional +from pydantic import Field + from nodescraper.models import CollectorArgs class NicCollectorArgs(CollectorArgs): - """ """ + """Collector arguments for NicPlugin (niccli/nicctl).""" - commands: Optional[List[str]] = None - use_sudo_niccli: bool = True - use_sudo_nicctl: bool = True + commands: Optional[List[str]] = Field( + default=None, + description="Optional list of niccli/nicctl commands to run. 
When None, default command set is used.", + ) + use_sudo_niccli: bool = Field( + default=True, + description="If True, run niccli commands with sudo when required.", + ) + use_sudo_nicctl: bool = Field( + default=True, + description="If True, run nicctl commands with sudo when required.", + ) diff --git a/nodescraper/plugins/inband/nic/nic_collector.py b/nodescraper/plugins/inband/nic/nic_collector.py index fd38939a..021fa3e3 100644 --- a/nodescraper/plugins/inband/nic/nic_collector.py +++ b/nodescraper/plugins/inband/nic/nic_collector.py @@ -67,105 +67,8 @@ command_to_canonical_key, ) -# Default commands: niccli (Broadcom) and nicctl (Pensando). Use {device_num} and {card_id} placeholders. -NICCLI_VERSION_CMD = "niccli --version" -NICCLI_VERSION_LEGACY_MAX = 233 # Commands below use -dev/-getoption/getqos; for version > this use --dev/--getoption/qos --ets --show -NICCLI_LIST_CMD = "niccli --list" -NICCLI_LIST_DEVICES_CMD = "niccli --list_devices" # new (> v233) -NICCLI_LIST_DEVICES_CMD_LEGACY = "niccli --listdev" # legacy (<= v233) -NICCLI_DISCOVERY_CMDS_LEGACY = [ - NICCLI_LIST_DEVICES_CMD_LEGACY, - NICCLI_LIST_CMD, -] -NICCLI_DISCOVERY_CMDS_NEW = [ - NICCLI_LIST_DEVICES_CMD, - NICCLI_LIST_CMD, -] -# All discovery command variants (for canonical key); default list for backward compat = legacy -NICCLI_DISCOVERY_CMDS = NICCLI_DISCOVERY_CMDS_LEGACY -NICCLI_DISCOVERY_CMDS_ALL = frozenset( - [NICCLI_LIST_DEVICES_CMD_LEGACY, NICCLI_LIST_DEVICES_CMD, NICCLI_LIST_CMD] -) -# Legacy (<= v233): single-dash options and getqos -NICCLI_SUPPORT_RDMA_CMD_TEMPLATE_LEGACY = ( - "niccli -dev {device_num} nvm -getoption support_rdma -scope 0" -) -NICCLI_PERFORMANCE_PROFILE_CMD_TEMPLATE_LEGACY = ( - "niccli -dev {device_num} nvm -getoption performance_profile" -) -NICCLI_PCIE_RELAXED_ORDERING_CMD_TEMPLATE_LEGACY = ( - "niccli -dev {device_num} nvm -getoption pcie_relaxed_ordering" -) -NICCLI_QOS_CMD_TEMPLATE_LEGACY = "niccli -dev {device_num} getqos" 
-NICCLI_PER_DEVICE_TEMPLATES_LEGACY = [ - NICCLI_SUPPORT_RDMA_CMD_TEMPLATE_LEGACY, - NICCLI_PERFORMANCE_PROFILE_CMD_TEMPLATE_LEGACY, - NICCLI_PCIE_RELAXED_ORDERING_CMD_TEMPLATE_LEGACY, - NICCLI_QOS_CMD_TEMPLATE_LEGACY, -] -# New (> v233): double-dash options and qos --ets --show -NICCLI_SUPPORT_RDMA_CMD_TEMPLATE_NEW = "niccli --dev {device_num} nvm --getoption support_rdma" -NICCLI_PERFORMANCE_PROFILE_CMD_TEMPLATE_NEW = ( - "niccli --dev {device_num} nvm --getoption performance_profile" -) -NICCLI_PCIE_RELAXED_ORDERING_CMD_TEMPLATE_NEW = ( - "niccli --dev {device_num} nvm --getoption pcie_relaxed_ordering" -) -NICCLI_QOS_CMD_TEMPLATE_NEW = "niccli --dev {device_num} qos --ets --show" -NICCLI_PER_DEVICE_TEMPLATES_NEW = [ - NICCLI_SUPPORT_RDMA_CMD_TEMPLATE_NEW, - NICCLI_PERFORMANCE_PROFILE_CMD_TEMPLATE_NEW, - NICCLI_PCIE_RELAXED_ORDERING_CMD_TEMPLATE_NEW, - NICCLI_QOS_CMD_TEMPLATE_NEW, -] -# Backward compatibility: default to legacy templates (used by _default_commands and any code that imports these) -NICCLI_SUPPORT_RDMA_CMD_TEMPLATE = NICCLI_SUPPORT_RDMA_CMD_TEMPLATE_LEGACY -NICCLI_PERFORMANCE_PROFILE_CMD_TEMPLATE = NICCLI_PERFORMANCE_PROFILE_CMD_TEMPLATE_LEGACY -NICCLI_PCIE_RELAXED_ORDERING_CMD_TEMPLATE = NICCLI_PCIE_RELAXED_ORDERING_CMD_TEMPLATE_LEGACY -NICCLI_PER_DEVICE_TEMPLATES = NICCLI_PER_DEVICE_TEMPLATES_LEGACY -# Text-format command for card discovery and pensando_nic_cards (no --json). 
-NICCTL_CARD_TEXT_CMD = "nicctl show card" -NICCTL_GLOBAL_COMMANDS = [ - "nicctl --version", - "nicctl show card flash partition --json", - "nicctl show card interrupts --json", - "nicctl show card logs --non-persistent", - "nicctl show card logs --boot-fault", - "nicctl show card logs --persistent", - "nicctl show card profile --json", - "nicctl show card time --json", - "nicctl show card statistics packet-buffer summary --json", - "nicctl show lif statistics --json", - "nicctl show lif internal queue-to-ud-pinning", - "nicctl show pipeline internal anomalies", - "nicctl show pipeline internal rsq-ring", - "nicctl show pipeline internal statistics memory", - "nicctl show port fsm", - "nicctl show port transceiver --json", - "nicctl show port statistics --json", - "nicctl show port internal mac", - "nicctl show qos headroom --json", - "nicctl show rdma queue --json", - "nicctl show rdma queue-pair --detail --json", - "nicctl show version firmware", -] -NICCTL_PER_CARD_TEMPLATES = [ - "nicctl show dcqcn --card {card_id} --json", - "nicctl show card hardware-config --card {card_id}", -] - -# Legacy text-format commands for Pensando (no --json); parsed by _parse_nicctl_* into pensando_nic_*. -NICCTL_LEGACY_TEXT_COMMANDS = [ - "nicctl show card", - "nicctl show dcqcn", - "nicctl show environment", - "nicctl show lif", - "nicctl show pcie ats", - "nicctl show port", - "nicctl show qos", - "nicctl show rdma statistics", - "nicctl show version host-software", -] +# niccli version threshold: legacy (<=233) vs new (>233) command syntax. +NICCLI_VERSION_LEGACY_MAX = 233 # Commands use -dev/-getoption/getqos; for version > this use --dev/--getoption/qos --ets --show # Max lengths for fields included in the serialized datamodel (keeps nicclidatamodel.json small). 
MAX_COMMAND_LENGTH_IN_DATAMODEL = 256 @@ -188,64 +91,6 @@ def _parse_niccli_version(stdout: str) -> Optional[int]: return None -def _get_niccli_per_device_templates(version: Optional[int]) -> List[str]: - """Return the per-device command templates for the given niccli version. - For version > NICCLI_VERSION_LEGACY_MAX (233) use new syntax (--dev, --getoption, qos --ets --show). - Otherwise use legacy syntax (-dev, -getoption, getqos). If version is None, default to legacy. - """ - if version is not None and version > NICCLI_VERSION_LEGACY_MAX: - return NICCLI_PER_DEVICE_TEMPLATES_NEW.copy() - return NICCLI_PER_DEVICE_TEMPLATES_LEGACY.copy() - - -def _get_niccli_discovery_commands(version: Optional[int]) -> List[str]: - """Return the discovery commands for the given niccli version. - Legacy (<= v233) uses --listdev; new (> v233) uses --list_devices. If version is None, default to legacy. - """ - if version is not None and version > NICCLI_VERSION_LEGACY_MAX: - return NICCLI_DISCOVERY_CMDS_NEW.copy() - return NICCLI_DISCOVERY_CMDS_LEGACY.copy() - - -# Commands whose output is very long; store only as file artifacts, not in data model. 
-def _is_artifact_only_command(cmd: str) -> bool: - c = cmd.strip() - if c.startswith("nicctl show card logs "): - return True - if "nicctl show card hardware-config --card " in c: - return True - if c == "nicctl show port fsm": - return True - if c.startswith("nicctl show pipeline internal "): - return True - if c == "nicctl show rdma queue-pair --detail --json": - return True - if c == "nicctl show lif internal queue-to-ud-pinning": - return True - if c == "nicctl show port internal mac": - return True - return False - - -def _merged_canonical_key(cmd: str) -> str: - """Return a single canonical key for commands that collect the same data.""" - if cmd in NICCLI_DISCOVERY_CMDS_ALL: - return "niccli_discovery" - return command_to_canonical_key(cmd) - - -def _default_commands() -> List[str]: - """Return the default flat list of command templates (with placeholders).""" - out: List[str] = [NICCLI_LIST_CMD] - for t in NICCLI_PER_DEVICE_TEMPLATES: - out.append(t) - for c in NICCTL_GLOBAL_COMMANDS: - out.append(c) - for t in NICCTL_PER_CARD_TEMPLATES: - out.append(t) - return out - - def _parse_niccli_qos_app_entries(stdout: str) -> List[NicCliQosAppEntry]: """Parse APP# blocks from niccli qos output into NicCliQosAppEntry list.""" entries: List[NicCliQosAppEntry] = [] @@ -472,6 +317,106 @@ class NicCollector(InBandDataCollector[NicDataModel, NicCollectorArgs]): DATA_MODEL = NicDataModel + # Default commands: niccli (Broadcom) and nicctl (Pensando). Use {device_num} and {card_id} placeholders. + # Names use CMD_* so docs/generate_plugin_doc_bundle.py can list them (dir(NicCollector) CMD*). 
+ CMD_NICCLI_VERSION = "niccli --version" + CMD_NICCLI_LIST = "niccli --list" + CMD_NICCLI_LIST_DEVICES = "niccli --list_devices" # new (> v233) + CMD_NICCLI_LIST_DEVICES_LEGACY = "niccli --listdev" # legacy (<= v233) + CMD_NICCLI_DISCOVERY_LEGACY = [ + CMD_NICCLI_LIST_DEVICES_LEGACY, + CMD_NICCLI_LIST, + ] + CMD_NICCLI_DISCOVERY_NEW = [ + CMD_NICCLI_LIST_DEVICES, + CMD_NICCLI_LIST, + ] + # All discovery command variants (for canonical key); default list for backward compat = legacy + CMD_NICCLI_DISCOVERY = CMD_NICCLI_DISCOVERY_LEGACY + CMD_NICCLI_DISCOVERY_ALL = frozenset( + [CMD_NICCLI_LIST_DEVICES_LEGACY, CMD_NICCLI_LIST_DEVICES, CMD_NICCLI_LIST] + ) + # Legacy (<= v233): single-dash options and getqos + CMD_NICCLI_SUPPORT_RDMA_TEMPLATE_LEGACY = ( + "niccli -dev {device_num} nvm -getoption support_rdma -scope 0" + ) + CMD_NICCLI_PERFORMANCE_PROFILE_TEMPLATE_LEGACY = ( + "niccli -dev {device_num} nvm -getoption performance_profile" + ) + CMD_NICCLI_PCIE_RELAXED_ORDERING_TEMPLATE_LEGACY = ( + "niccli -dev {device_num} nvm -getoption pcie_relaxed_ordering" + ) + CMD_NICCLI_QOS_TEMPLATE_LEGACY = "niccli -dev {device_num} getqos" + CMD_NICCLI_PER_DEVICE_LEGACY = [ + CMD_NICCLI_SUPPORT_RDMA_TEMPLATE_LEGACY, + CMD_NICCLI_PERFORMANCE_PROFILE_TEMPLATE_LEGACY, + CMD_NICCLI_PCIE_RELAXED_ORDERING_TEMPLATE_LEGACY, + CMD_NICCLI_QOS_TEMPLATE_LEGACY, + ] + # New (> v233): double-dash options and qos --ets --show + CMD_NICCLI_SUPPORT_RDMA_TEMPLATE_NEW = "niccli --dev {device_num} nvm --getoption support_rdma" + CMD_NICCLI_PERFORMANCE_PROFILE_TEMPLATE_NEW = ( + "niccli --dev {device_num} nvm --getoption performance_profile" + ) + CMD_NICCLI_PCIE_RELAXED_ORDERING_TEMPLATE_NEW = ( + "niccli --dev {device_num} nvm --getoption pcie_relaxed_ordering" + ) + CMD_NICCLI_QOS_TEMPLATE_NEW = "niccli --dev {device_num} qos --ets --show" + CMD_NICCLI_PER_DEVICE_NEW = [ + CMD_NICCLI_SUPPORT_RDMA_TEMPLATE_NEW, + CMD_NICCLI_PERFORMANCE_PROFILE_TEMPLATE_NEW, + 
CMD_NICCLI_PCIE_RELAXED_ORDERING_TEMPLATE_NEW, + CMD_NICCLI_QOS_TEMPLATE_NEW, + ] + # Backward compatibility: default to legacy templates + CMD_NICCLI_SUPPORT_RDMA_TEMPLATE = CMD_NICCLI_SUPPORT_RDMA_TEMPLATE_LEGACY + CMD_NICCLI_PERFORMANCE_PROFILE_TEMPLATE = CMD_NICCLI_PERFORMANCE_PROFILE_TEMPLATE_LEGACY + CMD_NICCLI_PCIE_RELAXED_ORDERING_TEMPLATE = CMD_NICCLI_PCIE_RELAXED_ORDERING_TEMPLATE_LEGACY + CMD_NICCLI_PER_DEVICE = CMD_NICCLI_PER_DEVICE_LEGACY + # Text-format command for card discovery and pensando_nic_cards (no --json). + CMD_NICCTL_CARD_TEXT = "nicctl show card" + CMD_NICCTL_GLOBAL = [ + "nicctl --version", + "nicctl show card flash partition --json", + "nicctl show card interrupts --json", + "nicctl show card logs --non-persistent", + "nicctl show card logs --boot-fault", + "nicctl show card logs --persistent", + "nicctl show card profile --json", + "nicctl show card time --json", + "nicctl show card statistics packet-buffer summary --json", + "nicctl show lif statistics --json", + "nicctl show lif internal queue-to-ud-pinning", + "nicctl show pipeline internal anomalies", + "nicctl show pipeline internal rsq-ring", + "nicctl show pipeline internal statistics memory", + "nicctl show port fsm", + "nicctl show port transceiver --json", + "nicctl show port statistics --json", + "nicctl show port internal mac", + "nicctl show qos headroom --json", + "nicctl show rdma queue --json", + "nicctl show rdma queue-pair --detail --json", + "nicctl show version firmware", + ] + CMD_NICCTL_PER_CARD = [ + "nicctl show dcqcn --card {card_id} --json", + "nicctl show card hardware-config --card {card_id}", + ] + + # Legacy text-format commands for Pensando (no --json); parsed by _parse_nicctl_* into pensando_nic_*. 
+ CMD_NICCTL_LEGACY_TEXT = [ + "nicctl show card", + "nicctl show dcqcn", + "nicctl show environment", + "nicctl show lif", + "nicctl show pcie ats", + "nicctl show port", + "nicctl show qos", + "nicctl show rdma statistics", + "nicctl show version host-software", + ] + def collect_data( self, args: Optional[NicCollectorArgs] = None, @@ -485,11 +430,11 @@ def collect_data( # Detect niccli version to choose command set (legacy <= v233 vs new > v233) niccli_version: Optional[int] = None - res_version = self._run_sut_cmd(NICCLI_VERSION_CMD, sudo=use_sudo_niccli) + res_version = self._run_sut_cmd(NicCollector.CMD_NICCLI_VERSION, sudo=use_sudo_niccli) if res_version.exit_code == 0 and res_version.stdout: niccli_version = _parse_niccli_version(res_version.stdout) - results[NICCLI_VERSION_CMD] = NicCommandResult( - command=NICCLI_VERSION_CMD, + results[NicCollector.CMD_NICCLI_VERSION] = NicCommandResult( + command=NicCollector.CMD_NICCLI_VERSION, stdout=res_version.stdout or "", stderr=res_version.stderr or "", exit_code=res_version.exit_code, @@ -514,9 +459,9 @@ def collect_data( # Discovery: card IDs from nicctl show card (text); same output used for pensando_nic_cards card_ids: List[str] = [] card_list_from_text: List[Dict[str, Any]] = [] - res_card = self._run_sut_cmd(NICCTL_CARD_TEXT_CMD, sudo=use_sudo_nicctl) - results[NICCTL_CARD_TEXT_CMD] = NicCommandResult( - command=NICCTL_CARD_TEXT_CMD, + res_card = self._run_sut_cmd(NicCollector.CMD_NICCTL_CARD_TEXT, sudo=use_sudo_nicctl) + results[NicCollector.CMD_NICCTL_CARD_TEXT] = NicCommandResult( + command=NicCollector.CMD_NICCTL_CARD_TEXT, stdout=res_card.stdout or "", stderr=res_card.stderr or "", exit_code=res_card.exit_code, @@ -545,13 +490,13 @@ def collect_data( for tpl in per_device_templates: for d in device_nums: commands_to_run.append(tpl.format(device_num=d)) - # nicctl global (card discovery already done via NICCTL_CARD_TEXT_CMD) - for c in NICCTL_GLOBAL_COMMANDS: + # nicctl global (card discovery already 
done via CMD_NICCTL_CARD_TEXT) + for c in NicCollector.CMD_NICCTL_GLOBAL: commands_to_run.append(c) - for tpl in NICCTL_PER_CARD_TEMPLATES: + for tpl in NicCollector.CMD_NICCTL_PER_CARD: for cid in card_ids: commands_to_run.append(tpl.format(card_id=cid)) - for cmd in NICCTL_LEGACY_TEXT_COMMANDS: + for cmd in NicCollector.CMD_NICCTL_LEGACY_TEXT: commands_to_run.append(cmd) # Run each command and store (artifact-only commands are not added to results / data model). @@ -1310,3 +1255,61 @@ def _parse_nicctl_version_firmware(self, stdout: str) -> List[PensandoNicVersion ) ) return entries + + +def _get_niccli_per_device_templates(version: Optional[int]) -> List[str]: + """Return the per-device command templates for the given niccli version. + For version > NICCLI_VERSION_LEGACY_MAX (233) use new syntax (--dev, --getoption, qos --ets --show). + Otherwise use legacy syntax (-dev, -getoption, getqos). If version is None, default to legacy. + """ + if version is not None and version > NICCLI_VERSION_LEGACY_MAX: + return NicCollector.CMD_NICCLI_PER_DEVICE_NEW.copy() + return NicCollector.CMD_NICCLI_PER_DEVICE_LEGACY.copy() + + +def _get_niccli_discovery_commands(version: Optional[int]) -> List[str]: + """Return the discovery commands for the given niccli version. + Legacy (<= v233) uses --listdev; new (> v233) uses --list_devices. If version is None, default to legacy. + """ + if version is not None and version > NICCLI_VERSION_LEGACY_MAX: + return NicCollector.CMD_NICCLI_DISCOVERY_NEW.copy() + return NicCollector.CMD_NICCLI_DISCOVERY_LEGACY.copy() + + +# Commands whose output is very long; store only as file artifacts, not in data model. 
+def _is_artifact_only_command(cmd: str) -> bool: + c = cmd.strip() + if c.startswith("nicctl show card logs "): + return True + if "nicctl show card hardware-config --card " in c: + return True + if c == "nicctl show port fsm": + return True + if c.startswith("nicctl show pipeline internal "): + return True + if c == "nicctl show rdma queue-pair --detail --json": + return True + if c == "nicctl show lif internal queue-to-ud-pinning": + return True + if c == "nicctl show port internal mac": + return True + return False + + +def _merged_canonical_key(cmd: str) -> str: + """Return a single canonical key for commands that collect the same data.""" + if cmd in NicCollector.CMD_NICCLI_DISCOVERY_ALL: + return "niccli_discovery" + return command_to_canonical_key(cmd) + + +def _default_commands() -> List[str]: + """Return the default flat list of command templates (with placeholders).""" + out: List[str] = [NicCollector.CMD_NICCLI_LIST] + for t in NicCollector.CMD_NICCLI_PER_DEVICE: + out.append(t) + for c in NicCollector.CMD_NICCTL_GLOBAL: + out.append(c) + for t in NicCollector.CMD_NICCTL_PER_CARD: + out.append(t) + return out diff --git a/nodescraper/plugins/inband/os/analyzer_args.py b/nodescraper/plugins/inband/os/analyzer_args.py index 52fd1124..58565123 100644 --- a/nodescraper/plugins/inband/os/analyzer_args.py +++ b/nodescraper/plugins/inband/os/analyzer_args.py @@ -32,8 +32,14 @@ class OsAnalyzerArgs(AnalyzerArgs): - exp_os: Union[str, list] = Field(default_factory=list) - exact_match: bool = True + exp_os: Union[str, list] = Field( + default_factory=list, + description="Expected OS name/version string(s) to match (e.g. 
from lsb_release or /etc/os-release).", + ) + exact_match: bool = Field( + default=True, + description="If True, require exact match for exp_os; otherwise substring match.", + ) @field_validator("exp_os", mode="before") @classmethod diff --git a/nodescraper/plugins/inband/package/analyzer_args.py b/nodescraper/plugins/inband/package/analyzer_args.py index 62a34c1f..6b131465 100644 --- a/nodescraper/plugins/inband/package/analyzer_args.py +++ b/nodescraper/plugins/inband/package/analyzer_args.py @@ -32,11 +32,22 @@ class PackageAnalyzerArgs(AnalyzerArgs): - exp_package_ver: Dict[str, Optional[str]] = Field(default_factory=dict) - regex_match: bool = False - # rocm_regex is optional and should be specified in plugin_config.json if needed - rocm_regex: Optional[str] = None - enable_rocm_regex: bool = False + exp_package_ver: Dict[str, Optional[str]] = Field( + default_factory=dict, + description="Map package name -> expected version (None = any version). Checked against installed packages.", + ) + regex_match: bool = Field( + default=False, + description="If True, match package versions with regex; otherwise exact or prefix match.", + ) + rocm_regex: Optional[str] = Field( + default=None, + description="Optional regex to identify ROCm package version (used when enable_rocm_regex is True).", + ) + enable_rocm_regex: bool = Field( + default=False, + description="If True, use rocm_regex (or default pattern) to extract ROCm version for checks.", + ) @classmethod def build_from_model(cls, datamodel: PackageDataModel) -> "PackageAnalyzerArgs": diff --git a/nodescraper/plugins/inband/pcie/analyzer_args.py b/nodescraper/plugins/inband/pcie/analyzer_args.py index dc3490a4..e42992cd 100644 --- a/nodescraper/plugins/inband/pcie/analyzer_args.py +++ b/nodescraper/plugins/inband/pcie/analyzer_args.py @@ -25,6 +25,8 @@ ############################################################################### from typing import Dict, Optional, Union +from pydantic import Field + from 
nodescraper.models import AnalyzerArgs @@ -41,13 +43,24 @@ class PcieAnalyzerArgs(AnalyzerArgs): exp_ten_bit_tag_req_en: Expected 10-bit tag request enable (int for all devices, dict for specific device IDs) """ - exp_speed: int = 5 - exp_width: int = 16 - exp_sriov_count: int = 0 - exp_gpu_count_override: Optional[int] = None - exp_max_payload_size: Optional[Union[Dict[int, int], int]] = None - exp_max_rd_req_size: Optional[Union[Dict[int, int], int]] = None - exp_ten_bit_tag_req_en: Optional[Union[Dict[int, int], int]] = None + exp_speed: int = Field(default=5, description="Expected PCIe link speed (generation 1–5).") + exp_width: int = Field(default=16, description="Expected PCIe link width in lanes (1–16).") + exp_sriov_count: int = Field(default=0, description="Expected SR-IOV virtual function count.") + exp_gpu_count_override: Optional[int] = Field( + default=None, description="Override expected GPU count for validation." + ) + exp_max_payload_size: Optional[Union[Dict[int, int], int]] = Field( + default=None, + description="Expected max payload size: int for all devices, or dict keyed by device ID.", + ) + exp_max_rd_req_size: Optional[Union[Dict[int, int], int]] = Field( + default=None, + description="Expected max read request size: int for all devices, or dict keyed by device ID.", + ) + exp_ten_bit_tag_req_en: Optional[Union[Dict[int, int], int]] = Field( + default=None, + description="Expected 10-bit tag request enable: int for all devices, or dict keyed by device ID.", + ) def normalize_to_dict( diff --git a/nodescraper/plugins/inband/process/analyzer_args.py b/nodescraper/plugins/inband/process/analyzer_args.py index 135a472b..39375ed9 100644 --- a/nodescraper/plugins/inband/process/analyzer_args.py +++ b/nodescraper/plugins/inband/process/analyzer_args.py @@ -24,13 +24,21 @@ # ############################################################################### +from pydantic import Field + from nodescraper.models import AnalyzerArgs from 
nodescraper.plugins.inband.process.processdata import ProcessDataModel class ProcessAnalyzerArgs(AnalyzerArgs): - max_kfd_processes: int = 0 - max_cpu_usage: float = 20.0 + max_kfd_processes: int = Field( + default=0, + description="Maximum allowed number of KFD (Kernel Fusion Driver) processes; 0 disables the check.", + ) + max_cpu_usage: float = Field( + default=20.0, + description="Maximum allowed CPU usage (percent) for process checks.", + ) @classmethod def build_from_model(cls, datamodel: ProcessDataModel) -> "ProcessAnalyzerArgs": diff --git a/nodescraper/plugins/inband/process/collector_args.py b/nodescraper/plugins/inband/process/collector_args.py index 87a65ba5..74099144 100644 --- a/nodescraper/plugins/inband/process/collector_args.py +++ b/nodescraper/plugins/inband/process/collector_args.py @@ -23,9 +23,15 @@ # SOFTWARE. # ############################################################################### +from pydantic import Field from nodescraper.models import CollectorArgs class ProcessCollectorArgs(CollectorArgs): - top_n_process: int = 10 + top_n_process: int = Field( + default=10, + description=( + "Number of top processes by CPU usage to collect " "(e.g. for top -b -n 1 -o %%CPU)." + ), + ) diff --git a/nodescraper/plugins/inband/rocm/analyzer_args.py b/nodescraper/plugins/inband/rocm/analyzer_args.py index f10821ee..cb7f1058 100644 --- a/nodescraper/plugins/inband/rocm/analyzer_args.py +++ b/nodescraper/plugins/inband/rocm/analyzer_args.py @@ -32,10 +32,18 @@ class RocmAnalyzerArgs(AnalyzerArgs): - exp_rocm: Union[str, list] = Field(default_factory=list) - exp_rocm_latest: str = Field(default="") - # Key = sub-version name (e.g. version_rocm); value = expected string or list of allowed strings - exp_rocm_sub_versions: dict[str, Union[str, list]] = Field(default_factory=dict) + exp_rocm: Union[str, list] = Field( + default_factory=list, + description="Expected ROCm version string(s) to match (e.g. 
from rocminfo).", + ) + exp_rocm_latest: str = Field( + default="", + description="Expected 'latest' ROCm path or version string for versioned installs.", + ) + exp_rocm_sub_versions: dict[str, Union[str, list]] = Field( + default_factory=dict, + description="Map sub-version name (e.g. version_rocm) to expected string or list of allowed strings.", + ) @field_validator("exp_rocm", mode="before") @classmethod diff --git a/nodescraper/plugins/inband/rocm/collector_args.py b/nodescraper/plugins/inband/rocm/collector_args.py index a3be0661..b55213dc 100644 --- a/nodescraper/plugins/inband/rocm/collector_args.py +++ b/nodescraper/plugins/inband/rocm/collector_args.py @@ -23,10 +23,15 @@ # SOFTWARE. # ############################################################################### +from pydantic import Field + from nodescraper.models import CollectorArgs class RocmCollectorArgs(CollectorArgs): """Collector arguments for RocmPlugin.""" - rocm_path: str = "/opt/rocm" + rocm_path: str = Field( + default="/opt/rocm", + description="Base path to ROCm installation (e.g. /opt/rocm). Used for rocminfo, clinfo, and version discovery.", + ) diff --git a/nodescraper/plugins/inband/storage/analyzer_args.py b/nodescraper/plugins/inband/storage/analyzer_args.py index 413c8ec0..fcd5ed10 100644 --- a/nodescraper/plugins/inband/storage/analyzer_args.py +++ b/nodescraper/plugins/inband/storage/analyzer_args.py @@ -31,8 +31,23 @@ class StorageAnalyzerArgs(AnalyzerArgs): - min_required_free_space_abs: Optional[str] = None - min_required_free_space_prct: Optional[int] = None - ignore_devices: Optional[list[str]] = Field(default_factory=list) - check_devices: Optional[list[str]] = Field(default_factory=list) - regex_match: bool = False + min_required_free_space_abs: Optional[str] = Field( + default=None, + description="Minimum required free space per mount (e.g. 
'10G', '1T').", + ) + min_required_free_space_prct: Optional[int] = Field( + default=None, + description="Minimum required free space as percentage of total (0–100).", + ) + ignore_devices: Optional[list[str]] = Field( + default_factory=list, + description="Mount points or devices to exclude from free-space checks.", + ) + check_devices: Optional[list[str]] = Field( + default_factory=list, + description="If non-empty, only these mount points or devices are checked.", + ) + regex_match: bool = Field( + default=False, + description="If True, match device/mount names with regex; otherwise exact match.", + ) diff --git a/nodescraper/plugins/inband/storage/collector_args.py b/nodescraper/plugins/inband/storage/collector_args.py index 3067f6a8..94222b23 100644 --- a/nodescraper/plugins/inband/storage/collector_args.py +++ b/nodescraper/plugins/inband/storage/collector_args.py @@ -23,9 +23,13 @@ # SOFTWARE. # ############################################################################### +from pydantic import Field from nodescraper.models import CollectorArgs class StorageCollectorArgs(CollectorArgs): - skip_sudo: bool = False + skip_sudo: bool = Field( + default=False, + description="If True, do not use sudo when running df and related storage commands.", + ) diff --git a/nodescraper/plugins/inband/sys_settings/analyzer_args.py b/nodescraper/plugins/inband/sys_settings/analyzer_args.py index 732398f0..594a4ca7 100644 --- a/nodescraper/plugins/inband/sys_settings/analyzer_args.py +++ b/nodescraper/plugins/inband/sys_settings/analyzer_args.py @@ -37,10 +37,16 @@ class SysfsCheck(BaseModel): For directory paths: use pattern (regex); at least one directory entry must match (e.g. ^hsn[0-9]+). 
""" - path: str - expected: list[str] = Field(default_factory=list) - name: str - pattern: Optional[str] = None + path: str = Field(description="Sysfs path to read (file) or list (directory).") + expected: list[str] = Field( + default_factory=list, + description="For file paths: list of acceptable values; if empty, check passes.", + ) + name: str = Field(description="Display name for this check in reports.") + pattern: Optional[str] = Field( + default=None, + description="For directory paths: regex; at least one entry must match (e.g. ^hsn[0-9]+).", + ) class SysSettingsAnalyzerArgs(AnalyzerArgs): @@ -50,7 +56,10 @@ class SysSettingsAnalyzerArgs(AnalyzerArgs): when collection_args is derived from analysis_args (e.g. by the plugin). """ - checks: Optional[list[SysfsCheck]] = None + checks: Optional[list[SysfsCheck]] = Field( + default=None, + description="List of sysfs checks (path, expected values or pattern, display name).", + ) def paths_to_collect(self) -> list[str]: """Return unique sysfs file paths from checks (those without pattern), for use by the collector.""" diff --git a/nodescraper/plugins/inband/sys_settings/collector_args.py b/nodescraper/plugins/inband/sys_settings/collector_args.py index f2e73ddd..207c46b3 100644 --- a/nodescraper/plugins/inband/sys_settings/collector_args.py +++ b/nodescraper/plugins/inband/sys_settings/collector_args.py @@ -23,7 +23,7 @@ # SOFTWARE. # ############################################################################### -from pydantic import BaseModel +from pydantic import BaseModel, Field class SysSettingsCollectorArgs(BaseModel): @@ -33,5 +33,11 @@ class SysSettingsCollectorArgs(BaseModel): directory_paths: sysfs paths to list (ls -1); use for checks that match entry names by regex. """ - paths: list[str] = [] - directory_paths: list[str] = [] + paths: list[str] = Field( + default_factory=list, + description="Sysfs paths to read (cat). Paths with '*' are collected with ls -l (e.g. 
class/net/*/device).", + ) + directory_paths: list[str] = Field( + default_factory=list, + description="Sysfs paths to list (ls -1); used for checks that match entry names by regex.", + ) diff --git a/nodescraper/plugins/inband/sysctl/analyzer_args.py b/nodescraper/plugins/inband/sysctl/analyzer_args.py index 7ecae81c..370e20d5 100644 --- a/nodescraper/plugins/inband/sysctl/analyzer_args.py +++ b/nodescraper/plugins/inband/sysctl/analyzer_args.py @@ -25,22 +25,46 @@ ############################################################################### from typing import Optional +from pydantic import Field + from nodescraper.models import AnalyzerArgs from nodescraper.plugins.inband.sysctl.sysctldata import SysctlDataModel class SysctlAnalyzerArgs(AnalyzerArgs): - exp_vm_swappiness: Optional[int] = None - exp_vm_numa_balancing: Optional[int] = None - exp_vm_oom_kill_allocating_task: Optional[int] = None - exp_vm_compaction_proactiveness: Optional[int] = None - exp_vm_compact_unevictable_allowed: Optional[int] = None - exp_vm_extfrag_threshold: Optional[int] = None - exp_vm_zone_reclaim_mode: Optional[int] = None - exp_vm_dirty_background_ratio: Optional[int] = None - exp_vm_dirty_ratio: Optional[int] = None - exp_vm_dirty_writeback_centisecs: Optional[int] = None - exp_kernel_numa_balancing: Optional[int] = None + exp_vm_swappiness: Optional[int] = Field( + default=None, description="Expected vm.swappiness value." + ) + exp_vm_numa_balancing: Optional[int] = Field( + default=None, description="Expected vm.numa_balancing value." + ) + exp_vm_oom_kill_allocating_task: Optional[int] = Field( + default=None, description="Expected vm.oom_kill_allocating_task value." + ) + exp_vm_compaction_proactiveness: Optional[int] = Field( + default=None, description="Expected vm.compaction_proactiveness value." + ) + exp_vm_compact_unevictable_allowed: Optional[int] = Field( + default=None, description="Expected vm.compact_unevictable_allowed value." 
+ ) + exp_vm_extfrag_threshold: Optional[int] = Field( + default=None, description="Expected vm.extfrag_threshold value." + ) + exp_vm_zone_reclaim_mode: Optional[int] = Field( + default=None, description="Expected vm.zone_reclaim_mode value." + ) + exp_vm_dirty_background_ratio: Optional[int] = Field( + default=None, description="Expected vm.dirty_background_ratio value." + ) + exp_vm_dirty_ratio: Optional[int] = Field( + default=None, description="Expected vm.dirty_ratio value." + ) + exp_vm_dirty_writeback_centisecs: Optional[int] = Field( + default=None, description="Expected vm.dirty_writeback_centisecs value." + ) + exp_kernel_numa_balancing: Optional[int] = Field( + default=None, description="Expected kernel.numa_balancing value." + ) @classmethod def build_from_model(cls, datamodel: SysctlDataModel) -> "SysctlAnalyzerArgs": diff --git a/nodescraper/plugins/ooband/redfish_endpoint/collector_args.py b/nodescraper/plugins/ooband/redfish_endpoint/collector_args.py index c63cd8db..662caffa 100644 --- a/nodescraper/plugins/ooband/redfish_endpoint/collector_args.py +++ b/nodescraper/plugins/ooband/redfish_endpoint/collector_args.py @@ -27,9 +27,34 @@ class RedfishEndpointCollectorArgs(BaseModel): - """Collection args: uris to GET.""" + """Collection args: uris to GET (or discover from tree), optional concurrency and tree discovery.""" - uris: list[str] = Field(default_factory=list) + uris: list[str] = Field( + default_factory=list, + description="Redfish URIs to GET. 
Ignored when discover_tree is True.", + ) + discover_tree: bool = Field( + default=False, + description="If True, discover endpoints from the BMC Redfish tree (service root and links) instead of using uris.", + ) + tree_max_depth: int = Field( + default=2, + ge=1, + le=10, + description="When discover_tree is True: max traversal depth (1=service root only, 2=root + collections, 3=+ members).", + ) + tree_max_endpoints: int = Field( + default=0, + ge=0, + le=10_000, + description="When discover_tree is True: max endpoints to discover (0=no limit).", + ) + max_workers: int = Field( + default=1, + ge=1, + le=32, + description="Max concurrent GETs (1=sequential). Use >1 for async endpoint fetches.", + ) @field_validator("uris", mode="before") @classmethod diff --git a/nodescraper/plugins/ooband/redfish_endpoint/endpoint_analyzer.py b/nodescraper/plugins/ooband/redfish_endpoint/endpoint_analyzer.py index 6323a437..59dd7a8d 100644 --- a/nodescraper/plugins/ooband/redfish_endpoint/endpoint_analyzer.py +++ b/nodescraper/plugins/ooband/redfish_endpoint/endpoint_analyzer.py @@ -129,10 +129,10 @@ def analyze_data( ) if failed: - details = "; ".join(f"{f['uri']} {f['path']}: {f['reason']}" for f in failed) + description = f"Redfish endpoint checks failed: {len(failed)} failure(s)" self._log_event( category=EventCategory.TELEMETRY, - description=f"Redfish endpoint checks failed: {len(failed)} failure(s) — {details}", + description=description, data={"failures": failed}, priority=EventPriority.WARNING, console_log=True, diff --git a/nodescraper/plugins/ooband/redfish_endpoint/endpoint_collector.py b/nodescraper/plugins/ooband/redfish_endpoint/endpoint_collector.py index 39dacf79..0a1305a2 100644 --- a/nodescraper/plugins/ooband/redfish_endpoint/endpoint_collector.py +++ b/nodescraper/plugins/ooband/redfish_endpoint/endpoint_collector.py @@ -23,15 +23,99 @@ # SOFTWARE. 
# ############################################################################### -from typing import Optional +from collections import deque +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import Any, Optional +from urllib.parse import urlparse from nodescraper.base import RedfishDataCollector +from nodescraper.connection.redfish import RedfishConnection, RedfishGetResult from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus from nodescraper.models import TaskResult from .collector_args import RedfishEndpointCollectorArgs from .endpoint_data import RedfishEndpointDataModel +ODATA_ID = "@odata.id" +MEMBERS = "Members" + + +def _normalize_path(odata_id: str, api_root: str) -> str: + """Convert @odata.id value (URL or path) to a normalized path under api_root.""" + if not odata_id or not isinstance(odata_id, str): + return "" + s = odata_id.strip() + if s.startswith(("http://", "https://")): + parsed = urlparse(s) + s = parsed.path or "/" + if not s.startswith("/"): + s = "/" + s + s = s.rstrip("/") or "/" + api_root_norm = api_root.strip("/") + if api_root_norm and not s.startswith("/" + api_root_norm): + return "" + return s + + +def _extract_odata_ids(obj: Any) -> list[str]: + """Recursively extract all @odata.id values from a Redfish JSON body.""" + out: list[str] = [] + if isinstance(obj, dict): + if ODATA_ID in obj and isinstance(obj[ODATA_ID], str): + out.append(obj[ODATA_ID]) + for k, v in obj.items(): + if k == MEMBERS and isinstance(v, list): + for item in v: + if ( + isinstance(item, dict) + and ODATA_ID in item + and isinstance(item[ODATA_ID], str) + ): + out.append(item[ODATA_ID]) + elif isinstance(v, dict): + out.extend(_extract_odata_ids(v)) + elif isinstance(v, list): + for item in v: + if isinstance(item, dict): + out.extend(_extract_odata_ids(item)) + return out + + +def _discover_tree( + connection: RedfishConnection, + api_root: str, + max_depth: int, + max_endpoints: int, +) -> tuple[list[str], 
dict[str, dict], list[RedfishGetResult]]: + """ + Traverse the Redfish resource tree from the service root. + + max_depth matches collection_args.tree_max_depth: 1 = service root only; 2 = root + one link + level; child links are only enqueued when depth + 1 < max_depth (root is depth 0). + """ + root_path = _normalize_path(api_root, api_root) or ("/" + api_root.strip("/")) + seen: set[str] = set() + to_visit: deque[tuple[str, int]] = deque([(root_path, 0)]) + responses: dict[str, dict] = {} + results: list[RedfishGetResult] = [] + while to_visit: + if max_endpoints and len(seen) >= max_endpoints: + break + path, depth = to_visit.popleft() + if path in seen or depth > max_depth: + continue + seen.add(path) + res = connection.run_get(path) + results.append(res) + if res.success and res.data is not None: + responses[path] = res.data + for odata_id in _extract_odata_ids(res.data): + link_path = _normalize_path(odata_id, api_root) + # Follow only if the child depth stays strictly below max_depth (1 = root only). 
+ if link_path and link_path not in seen and depth + 1 < max_depth: + to_visit.append((link_path, depth + 1)) + return sorted(seen), responses, results + def _uris_from_args(args: Optional[RedfishEndpointCollectorArgs]) -> list[str]: """Return list of URIs from collector args.uris.""" @@ -40,6 +124,18 @@ def _uris_from_args(args: Optional[RedfishEndpointCollectorArgs]) -> list[str]: return list(args.uris) if args.uris else [] +def _discover_tree_enabled(args: Optional[RedfishEndpointCollectorArgs]) -> bool: + """True only when tree discovery is explicitly enabled (avoids string/other truthy junk).""" + if args is None: + return False + return getattr(args, "discover_tree", False) is True + + +def _fetch_one(connection_copy: RedfishConnection, path: str) -> RedfishGetResult: + """Run a single GET on a connection copy (used from worker threads).""" + return connection_copy.run_get(path) + + class RedfishEndpointCollector( RedfishDataCollector[RedfishEndpointDataModel, RedfishEndpointCollectorArgs] ): @@ -50,30 +146,106 @@ class RedfishEndpointCollector( def collect_data( self, args: Optional[RedfishEndpointCollectorArgs] = None ) -> tuple[TaskResult, Optional[RedfishEndpointDataModel]]: - """GET each configured Redfish URI via _run_redfish_get() and store the JSON response.""" + """Collect via tree discovery, or via explicit URIs, or skip if neither is configured.""" uris = _uris_from_args(args) + use_tree = _discover_tree_enabled(args) + + # 1) Tree discovery: only when discover_tree is explicitly true + if use_tree: + api_root = getattr(self.connection, "api_root", "redfish/v1") + max_depth = getattr(args, "tree_max_depth", 2) if args else 2 + max_endpoints = (getattr(args, "tree_max_endpoints", 0) or 0) if args else 0 + _paths, responses, results = _discover_tree( + self.connection, + api_root, + max_depth=max_depth, + max_endpoints=max_endpoints, + ) + for res in results: + self.result.artifacts.append(res) + if not res.success and res.error: + self._log_event( + 
category=EventCategory.RUNTIME, + description=f"Redfish GET failed during tree discovery for {res.path}: {res.error}", + priority=EventPriority.WARNING, + console_log=True, + ) + if not responses: + self.result.message = "No Redfish endpoints discovered from tree" + self.result.status = ExecutionStatus.ERROR + return self.result, None + data = RedfishEndpointDataModel(responses=responses) + self.result.message = f"Collected {len(responses)} Redfish endpoint(s) from tree" + self.result.status = ExecutionStatus.OK + return self.result, data + + # 2) URI list: when discover_tree is false/absent and uris are provided if not uris: - self.result.message = "No Redfish URIs configured" + self.result.message = ( + "No collection mode configured: set collection_args.discover_tree to true " + "or provide collection_args.uris" + ) self.result.status = ExecutionStatus.NOT_RAN return self.result, None - responses: dict[str, dict] = {} + paths = [] for uri in uris: - path = uri + path = uri.strip() if uri else "" if not path: continue if not path.startswith("/"): path = "/" + path - res = self._run_redfish_get(path, log_artifact=True) - if res.success and res.data is not None: - responses[res.path] = res.data - else: - self._log_event( - category=EventCategory.RUNTIME, - description=f"Redfish GET failed for {path}: {res.error or 'unknown'}", - priority=EventPriority.WARNING, - console_log=True, - ) + paths.append(path) + + max_workers = getattr(args, "max_workers", 1) if args else 1 + max_workers = min(max_workers, len(paths)) + + if max_workers <= 1: + # Sequential + responses = {} + for path in paths: + res = self._run_redfish_get(path, log_artifact=True) + if res.success and res.data is not None: + responses[res.path] = res.data + else: + self._log_event( + category=EventCategory.RUNTIME, + description=f"Redfish GET failed for {path}: {res.error or 'unknown'}", + priority=EventPriority.WARNING, + console_log=True, + ) + else: + # Concurrent: one connection copy per worker, 
collect results in main thread + responses = {} + results = [] + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = {} + for path in paths: + conn = self.connection.copy() + futures[executor.submit(_fetch_one, conn, path)] = path + for future in as_completed(futures): + path = futures[future] + try: + res = future.result() + results.append(res) + if res.success and res.data is not None: + responses[res.path] = res.data + else: + self._log_event( + category=EventCategory.RUNTIME, + description=f"Redfish GET failed for {path}: {res.error or 'unknown'}", + priority=EventPriority.WARNING, + console_log=True, + ) + except Exception as e: + self._log_event( + category=EventCategory.RUNTIME, + description=f"Redfish GET failed for {path}: {e!s}", + priority=EventPriority.WARNING, + console_log=True, + ) + for res in results: + self.result.artifacts.append(res) if not responses: self.result.message = "No Redfish endpoints could be read" diff --git a/nodescraper/plugins/regex_search/__init__.py b/nodescraper/plugins/regex_search/__init__.py new file mode 100644 index 00000000..708b6b04 --- /dev/null +++ b/nodescraper/plugins/regex_search/__init__.py @@ -0,0 +1,28 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from .regex_search_plugin import RegexSearchPlugin + +__all__ = ["RegexSearchPlugin"] diff --git a/nodescraper/plugins/regex_search/analyzer_args.py b/nodescraper/plugins/regex_search/analyzer_args.py new file mode 100644 index 00000000..b30acb7e --- /dev/null +++ b/nodescraper/plugins/regex_search/analyzer_args.py @@ -0,0 +1,50 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from typing import Any, Optional + +from pydantic import Field + +from nodescraper.models import AnalyzerArgs + + +class RegexSearchAnalyzerArgs(AnalyzerArgs): + """Arguments for RegexSearchAnalyzer (dict items match Dmesg-style error_regex).""" + + error_regex: Optional[list[dict[str, Any]]] = Field( + default=None, + description=( + "Regex patterns to search for; each dict may include regex (str), message, " + "event_category, event_priority (same as Dmesg analyzer error_regex). " + ), + ) + interval_to_collapse_event: int = Field( + default=60, + description="Seconds within which repeated events are collapsed into one.", + ) + num_timestamps: int = Field( + default=3, + description="Number of timestamps to include per event in output.", + ) diff --git a/nodescraper/plugins/regex_search/regex_search_analyzer.py b/nodescraper/plugins/regex_search/regex_search_analyzer.py new file mode 100644 index 00000000..0b4384f4 --- /dev/null +++ b/nodescraper/plugins/regex_search/regex_search_analyzer.py @@ -0,0 +1,102 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +import os +from typing import Optional, Union + +from nodescraper.base.regexanalyzer import ErrorRegex, RegexAnalyzer, RegexEvent +from nodescraper.enums import ExecutionStatus +from nodescraper.models import TaskResult + +from .analyzer_args import RegexSearchAnalyzerArgs +from .regex_search_data import RegexSearchData + + +class RegexSearchAnalyzer(RegexAnalyzer[RegexSearchData, RegexSearchAnalyzerArgs]): + """Run user-provided regexes against text loaded from --data (file or directory).""" + + DATA_MODEL = RegexSearchData + + ERROR_REGEX: list[ErrorRegex] = [] + + def _build_regex_event( + self, regex_obj: ErrorRegex, match: Union[str, list[str]], source: str + ) -> RegexEvent: + """Augment the default event text with a file path when the origin is a concrete path. 
+ + Args: + regex_obj: Metadata for the rule that produced the match. + match: Substring or grouped capture text from the pattern. + source: Origin label, or an absolute path when matching per file. + + Returns: + Match record with an extended description when a path-like source is present. + """ + event = super()._build_regex_event(regex_obj, match, source) + if source and source != "regex_search": + event.description = f"{regex_obj.message} [file: {source}]" + return event + + def analyze_data( + self, + data: RegexSearchData, + args: Optional[RegexSearchAnalyzerArgs] = None, + ) -> TaskResult: + """Scan loaded inputs with the given patterns, or mark the task not run if inputs are incomplete. + + Args: + data: Aggregated and per-file text loaded from the user data path. + args: Optional pattern list and timing knobs; omitted or empty patterns skip work. + + Returns: + Work outcome with match events, or a not-run status when patterns are absent. + """ + if args is None or not args.error_regex: + self.result.status = ExecutionStatus.NOT_RAN + self.result.message = "Analysis args need to be provided for the analyzer to run" + return self.result + + final_regex = self._convert_and_extend_error_regex(args.error_regex, []) + + if data.files: + for rel_path in sorted(data.files.keys()): + file_content = data.files[rel_path] + abs_source = os.path.normpath(os.path.join(data.data_root, rel_path)) + self.result.events += self.check_all_regexes( + content=file_content, + source=abs_source, + error_regex=final_regex, + num_timestamps=args.num_timestamps, + interval_to_collapse_event=args.interval_to_collapse_event, + ) + else: + self.result.events += self.check_all_regexes( + content=data.content, + source=data.data_root or "regex_search", + error_regex=final_regex, + num_timestamps=args.num_timestamps, + interval_to_collapse_event=args.interval_to_collapse_event, + ) + return self.result diff --git a/nodescraper/plugins/regex_search/regex_search_data.py 
b/nodescraper/plugins/regex_search/regex_search_data.py new file mode 100644 index 00000000..a12b2841 --- /dev/null +++ b/nodescraper/plugins/regex_search/regex_search_data.py @@ -0,0 +1,107 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +import os +from pathlib import Path +from typing import Union + +from pydantic import Field + +from nodescraper.models import DataModel +from nodescraper.utils import get_unique_filename + + +class RegexSearchData(DataModel): + """Loaded file or directory contents passed to the analyzer (via --data).""" + + content: str + data_root: str = "" + files: dict[str, str] = Field(default_factory=dict) + + def log_model(self, log_path: str) -> None: + """Persist the aggregated text payload as one log file under the given base path. + + Args: + log_path: Directory where the log file should be written. + + Returns: + None. + """ + log_name = os.path.join(log_path, get_unique_filename(log_path, "regex_search_source.log")) + with open(log_name, "w", encoding="utf-8") as log_file: + log_file.write(self.content) + + @classmethod + def import_model(cls, model_input: Union[dict, str]) -> "RegexSearchData": + """Import datamodel. + + Args: + model_input: Keyed fields for direct validation, or a path string to load from disk. + + Returns: + Instance with content, root path, and per-file bodies filled in. + """ + if isinstance(model_input, dict): + return cls.model_validate(model_input) + if isinstance(model_input, str): + return cls._from_filesystem_path(model_input) + raise ValueError("Invalid input for regex search data") + + @classmethod + def _from_filesystem_path(cls, path: str) -> "RegexSearchData": + """Read one file or every file under a directory into a merged view plus a path-to-text map. + + Args: + path: Absolute or resolvable path to a file or directory. + + Returns: + Instance built from the read text and discovered relative paths. 
+ + """ + path = os.path.abspath(path) + if not os.path.exists(path): + raise FileNotFoundError(f"Path not found: {path}") + if os.path.isfile(path): + text = Path(path).read_text(encoding="utf-8", errors="replace") + rel = os.path.basename(path) + data_root = os.path.dirname(path) or os.path.abspath(os.path.curdir) + return cls(content=text, data_root=data_root, files={rel: text}) + if os.path.isdir(path): + files: dict[str, str] = {} + parts: list[str] = [] + for root, _dirs, filenames in os.walk(path): + for name in sorted(filenames): + fp = os.path.join(root, name) + if not os.path.isfile(fp): + continue + rel = os.path.relpath(fp, path) + try: + text = Path(fp).read_text(encoding="utf-8", errors="replace") + except OSError: + continue + files[rel] = text + parts.append(f"===== {rel} =====\n{text}") + return cls(content="\n".join(parts), data_root=path, files=files) + raise ValueError(f"Unsupported path type: {path}") diff --git a/nodescraper/plugins/regex_search/regex_search_plugin.py b/nodescraper/plugins/regex_search/regex_search_plugin.py new file mode 100644 index 00000000..36d650c6 --- /dev/null +++ b/nodescraper/plugins/regex_search/regex_search_plugin.py @@ -0,0 +1,76 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from typing import Optional, Union + +from nodescraper.connection.inband import InBandConnectionManager, SSHConnectionParams +from nodescraper.enums import EventPriority +from nodescraper.interfaces import DataPlugin +from nodescraper.models import CollectorArgs, TaskResult + +from .analyzer_args import RegexSearchAnalyzerArgs +from .regex_search_analyzer import RegexSearchAnalyzer +from .regex_search_data import RegexSearchData + + +class RegexSearchPlugin( + DataPlugin[ + InBandConnectionManager, + SSHConnectionParams, + RegexSearchData, + CollectorArgs, + RegexSearchAnalyzerArgs, + ] +): + """Analyzer-only plugin: search user regexes against a file or directory (--data).""" + + DATA_MODEL = RegexSearchData + ANALYZER = RegexSearchAnalyzer + + def analyze( + self, + max_event_priority_level: Optional[Union[EventPriority, str]] = EventPriority.CRITICAL, + analysis_args: Optional[Union[RegexSearchAnalyzerArgs, dict]] = None, + data: Optional[Union[str, dict, RegexSearchData]] = None, + ) -> TaskResult: + if analysis_args is None: + missing_error_regex = True + elif isinstance(analysis_args, RegexSearchAnalyzerArgs): + missing_error_regex = not bool(analysis_args.error_regex) + elif isinstance(analysis_args, dict): + er = analysis_args.get("error_regex") + missing_error_regex = er is None or er == [] + else: + missing_error_regex = True + if missing_error_regex: + self.logger.warning( + "RegexSearchPlugin: 
analysis args need to be provided for the analyzer to run " + "(e.g. --error-regex for each pattern)." + ) + return super().analyze( + max_event_priority_level=max_event_priority_level, + analysis_args=analysis_args, + data=data, + ) diff --git a/nodescraper/typeutils.py b/nodescraper/typeutils.py index 4760d530..cd7a4650 100644 --- a/nodescraper/typeutils.py +++ b/nodescraper/typeutils.py @@ -25,7 +25,7 @@ ############################################################################### import inspect import types -from typing import Any, Callable, Optional, Type, Union, get_args, get_origin +from typing import Annotated, Any, Callable, Optional, Type, Union, get_args, get_origin from pydantic import BaseModel, Field @@ -120,6 +120,9 @@ def process_type(cls, input_type: type[Any]) -> list[TypeClass]: list[TypeClass]: list of TypeClass objects containing type class and inner type information """ origin = get_origin(input_type) + if origin is Annotated: + input_type = get_args(input_type)[0] + origin = get_origin(input_type) if origin is None: return [TypeClass(type_class=input_type)] if origin is Union or getattr(types, "UnionType", None) is origin: diff --git a/test/functional/conftest.py b/test/functional/conftest.py index 77ded955..bce73895 100644 --- a/test/functional/conftest.py +++ b/test/functional/conftest.py @@ -28,9 +28,24 @@ import subprocess import sys from typing import List +from unittest.mock import MagicMock import pytest +from nodescraper.models.systeminfo import OSFamily, SystemInfo + + +@pytest.fixture +def system_info(): + """Minimal SystemInfo for collectors that require it (same shape as test/unit/conftest).""" + return SystemInfo(name="test_host", platform="X", os_family=OSFamily.LINUX, sku="TEST") + + +@pytest.fixture +def redfish_conn_mock(): + """MagicMock Redfish connection for Redfish plugin tests.""" + return MagicMock() + @pytest.fixture def run_cli_command(): @@ -46,7 +61,8 @@ def _run_command(args: List[str], check: bool = False): 
Returns: subprocess.CompletedProcess instance """ - cmd = [sys.executable, "-m", "nodescraper.cli.cli"] + args + # -W: avoid runpy RuntimeWarning when nodescraper.cli was imported before -m nodescraper.cli.cli + cmd = [sys.executable, "-W", "ignore::RuntimeWarning", "-m", "nodescraper.cli.cli"] + args return subprocess.run( cmd, capture_output=True, diff --git a/test/functional/fixtures/redfish_endpoint_plugin_config_full_args.json b/test/functional/fixtures/redfish_endpoint_plugin_config_full_args.json new file mode 100644 index 00000000..099057c5 --- /dev/null +++ b/test/functional/fixtures/redfish_endpoint_plugin_config_full_args.json @@ -0,0 +1,26 @@ +{ + "plugins": { + "RedfishEndpointPlugin": { + "collection_args": { + "uris": ["/redfish/v1", "/redfish/v1/Systems"], + "discover_tree": false, + "tree_max_depth": 3, + "tree_max_endpoints": 100, + "max_workers": 2 + }, + "analysis_args": { + "checks": { + "/redfish/v1/Systems/1": { + "Status/Health": { "anyOf": ["OK", "Warning"] }, + "PowerState": { "eq": "On" }, + "PowerConsumedWatts": { "min": 0, "max": 500 } + }, + "*": { + "Status/State": { "eq": "Enabled" }, + "Status/Health": { "anyOf": ["OK", "Warning"] } + } + } + } + } + } +} diff --git a/test/functional/fixtures/regex_search_multi_logs/app_error.log b/test/functional/fixtures/regex_search_multi_logs/app_error.log new file mode 100644 index 00000000..391fb32c --- /dev/null +++ b/test/functional/fixtures/regex_search_multi_logs/app_error.log @@ -0,0 +1,3 @@ +startup complete +ERROR: dependency timeout connecting to backend +shutdown clean diff --git a/test/functional/fixtures/regex_search_multi_logs/no_errors.log b/test/functional/fixtures/regex_search_multi_logs/no_errors.log new file mode 100644 index 00000000..854d9322 --- /dev/null +++ b/test/functional/fixtures/regex_search_multi_logs/no_errors.log @@ -0,0 +1,2 @@ +2026-04-09 service healthy +all checks passed diff --git a/test/functional/fixtures/regex_search_multi_logs/storage_warn.log 
b/test/functional/fixtures/regex_search_multi_logs/storage_warn.log new file mode 100644 index 00000000..bc11a75a --- /dev/null +++ b/test/functional/fixtures/regex_search_multi_logs/storage_warn.log @@ -0,0 +1,3 @@ +mount ok +WARNING: disk full on /data within 5% +idle diff --git a/test/functional/fixtures/regex_search_sample.log b/test/functional/fixtures/regex_search_sample.log new file mode 100644 index 00000000..7564228b --- /dev/null +++ b/test/functional/fixtures/regex_search_sample.log @@ -0,0 +1,4 @@ +2026-04-09T10:00:00Z service started OK +2026-04-09T10:01:00Z ERROR: connection reset by peer +2026-04-09T10:02:00Z routine check passed +2026-04-09T10:03:00Z WARNING: disk full on /var diff --git a/test/functional/test_cli_describe.py b/test/functional/test_cli_describe.py index 52097a54..502bbe85 100644 --- a/test/functional/test_cli_describe.py +++ b/test/functional/test_cli_describe.py @@ -25,10 +25,12 @@ ############################################################################### """Functional tests for CLI describe command.""" +from pathlib import Path + def test_describe_command_list_plugins(run_cli_command): """Test that describe command can list all plugins.""" - result = run_cli_command(["describe", "plugin"]) + result = run_cli_command(["--log-path", "None", "describe", "plugin"]) assert result.returncode == 0 assert len(result.stdout) > 0 @@ -38,7 +40,7 @@ def test_describe_command_list_plugins(run_cli_command): def test_describe_command_single_plugin(run_cli_command): """Test that describe command can describe a single plugin.""" - result = run_cli_command(["describe", "plugin", "BiosPlugin"]) + result = run_cli_command(["--log-path", "None", "describe", "plugin", "BiosPlugin"]) assert result.returncode == 0 assert len(result.stdout) > 0 @@ -48,8 +50,31 @@ def test_describe_command_single_plugin(run_cli_command): def test_describe_invalid_plugin(run_cli_command): """Test that describe command handles invalid plugin gracefully.""" - result = 
run_cli_command(["describe", "plugin", "NonExistentPlugin"]) + result = run_cli_command(["--log-path", "None", "describe", "plugin", "NonExistentPlugin"]) assert result.returncode != 0 output = (result.stdout + result.stderr).lower() assert "error" in output or "not found" in output or "invalid" in output + + +def test_describe_no_console_log_writes_nodescraper_log(run_cli_command, tmp_path): + """With --no-console-log, describe output is only in nodescraper.log under scraper_logs_*.""" + log_base = str(tmp_path / "logs") + result = run_cli_command( + [ + "--log-path", + log_base, + "--no-console-log", + "describe", + "plugin", + "BiosPlugin", + ], + check=False, + ) + assert result.returncode == 0 + run_dirs = list(Path(log_base).glob("scraper_logs_*")) + assert len(run_dirs) == 1 + log_file = run_dirs[0] / "nodescraper.log" + assert log_file.is_file() + text = log_file.read_text(encoding="utf-8").lower() + assert "bios" in text diff --git a/test/functional/test_cli_help.py b/test/functional/test_cli_help.py index a1bd90ae..f88278f1 100644 --- a/test/functional/test_cli_help.py +++ b/test/functional/test_cli_help.py @@ -28,6 +28,12 @@ import subprocess import sys +import pytest + +from nodescraper.pluginregistry import PluginRegistry + +_REGISTERED_PLUGIN_NAMES = tuple(sorted(PluginRegistry().plugins.keys())) + def test_help_command(): """Test that node-scraper -h displays help information.""" @@ -67,3 +73,29 @@ def test_help_shows_subcommands(): assert result.returncode == 0 output = result.stdout.lower() assert "run-plugins" in output or "commands:" in output or "positional arguments:" in output + + +@pytest.mark.parametrize("plugin_name", _REGISTERED_PLUGIN_NAMES) +def test_run_plugins_help_for_each_registered_plugin(plugin_name, tmp_path): + """``run-plugins -h`` must succeed for every plugin in the registry.""" + log_path = str(tmp_path / "logs") + result = subprocess.run( + [ + sys.executable, + "-m", + "nodescraper.cli.cli", + "--log-path", + log_path, + 
"run-plugins", + plugin_name, + "-h", + ], + capture_output=True, + text=True, + ) + assert result.returncode == 0, ( + f"run-plugins {plugin_name} -h failed:\n" + f"stdout={result.stdout!r}\nstderr={result.stderr!r}" + ) + combined = (result.stdout or "") + (result.stderr or "") + assert "usage:" in combined.lower() diff --git a/test/functional/test_cli_no_console_log.py b/test/functional/test_cli_no_console_log.py new file mode 100644 index 00000000..ed9fd8fa --- /dev/null +++ b/test/functional/test_cli_no_console_log.py @@ -0,0 +1,50 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### + +import subprocess +import sys + + +def test_no_console_log_with_log_path_none_still_parses(): + """--log-path None + --no-console-log defaults to ./scraper_logs_* (no argparse error).""" + result = subprocess.run( + [ + sys.executable, + "-m", + "nodescraper.cli.cli", + "--log-path", + "None", + "--no-console-log", + "run-plugins", + "OsPlugin", + "-h", + ], + capture_output=True, + text=True, + ) + assert result.returncode == 0, result.stderr + combined = (result.stdout or "") + (result.stderr or "") + assert "no-console-log requires" not in combined.lower() diff --git a/test/functional/test_redfish_endpoint_plugin.py b/test/functional/test_redfish_endpoint_plugin.py index 2a25043e..f2513b62 100644 --- a/test/functional/test_redfish_endpoint_plugin.py +++ b/test/functional/test_redfish_endpoint_plugin.py @@ -23,92 +23,368 @@ # SOFTWARE. # ############################################################################### -from pathlib import Path +from unittest.mock import MagicMock import pytest +from nodescraper.connection.redfish import RedfishGetResult +from nodescraper.enums import EventCategory, ExecutionStatus +from nodescraper.plugins.ooband.redfish_endpoint import ( + RedfishEndpointCollector, + RedfishEndpointCollectorArgs, +) +from nodescraper.plugins.ooband.redfish_endpoint import endpoint_collector as ec + @pytest.fixture -def fixtures_dir(): - """Return path to functional test fixtures directory.""" - return Path(__file__).parent / "fixtures" +def redfish_endpoint_collector(system_info, redfish_conn_mock): + return RedfishEndpointCollector( + system_info=system_info, + connection=redfish_conn_mock, + ) -@pytest.fixture -def redfish_plugin_config(fixtures_dir): - """Path to RedfishEndpointPlugin config (URIs + checks).""" - return fixtures_dir / "redfish_endpoint_plugin_config.json" +def test_redfish_endpoint_collector_no_uris(redfish_endpoint_collector): + result, data 
= redfish_endpoint_collector.collect_data() + assert result.status == ExecutionStatus.NOT_RAN + assert ( + result.message + == "No collection mode configured: set collection_args.discover_tree to true or provide collection_args.uris" + ) + assert data is None -@pytest.fixture -def redfish_connection_config(fixtures_dir): - """Path to Redfish connection config (RedfishConnectionManager).""" - return fixtures_dir / "redfish_connection_config.json" +def test_redfish_endpoint_collector_no_uris_with_args(redfish_endpoint_collector): + result, data = redfish_endpoint_collector.collect_data( + args=RedfishEndpointCollectorArgs(uris=[]) + ) + assert result.status == ExecutionStatus.NOT_RAN + assert data is None -def test_redfish_endpoint_plugin_with_config_and_connection( - run_cli_command, redfish_plugin_config, redfish_connection_config, tmp_path -): - assert redfish_plugin_config.exists(), f"Config not found: {redfish_plugin_config}" - assert redfish_connection_config.exists(), f"Config not found: {redfish_connection_config}" +def test_redfish_endpoint_collector_one_uri_success(redfish_endpoint_collector, redfish_conn_mock): + redfish_conn_mock.run_get.return_value = RedfishGetResult( + path="/redfish/v1", + success=True, + data={"Name": "Root"}, + status_code=200, + ) + result, data = redfish_endpoint_collector.collect_data( + args=RedfishEndpointCollectorArgs(uris=["/redfish/v1"]) + ) + assert result.status == ExecutionStatus.OK + assert result.message == "Collected 1 Redfish endpoint(s)" + assert data is not None + assert data.responses["/redfish/v1"]["Name"] == "Root" + redfish_conn_mock.run_get.assert_called_once() + call_path = redfish_conn_mock.run_get.call_args[0][0] + assert call_path == "/redfish/v1" or call_path.strip("/") == "redfish/v1" - log_path = str(tmp_path / "logs_redfish") - result = run_cli_command( - [ - "--log-path", - log_path, - "--connection-config", - str(redfish_connection_config), - "--plugin-configs=" + str(redfish_plugin_config), - 
"run-plugins", - "RedfishEndpointPlugin", - ], - check=False, + +def test_redfish_endpoint_collector_uri_normalized_with_leading_slash( + redfish_endpoint_collector, redfish_conn_mock +): + redfish_conn_mock.run_get.return_value = RedfishGetResult( + path="/redfish/v1/Systems", + success=True, + data={"Members": []}, + status_code=200, + ) + result, data = redfish_endpoint_collector.collect_data( + args=RedfishEndpointCollectorArgs(uris=["redfish/v1/Systems"]) ) + assert result.status == ExecutionStatus.OK + assert data is not None + assert "/redfish/v1/Systems" in data.responses or "redfish/v1/Systems" in data.responses - output = result.stdout + result.stderr - assert "RedfishEndpointPlugin" in output or "Redfish" in output +def test_redfish_endpoint_collector_one_fail_no_success( + redfish_endpoint_collector, redfish_conn_mock +): + redfish_conn_mock.run_get.return_value = RedfishGetResult( + path="/redfish/v1", + success=False, + error="Connection refused", + status_code=None, + ) + result, data = redfish_endpoint_collector.collect_data( + args=RedfishEndpointCollectorArgs(uris=["/redfish/v1"]) + ) + assert result.status == ExecutionStatus.ERROR + assert result.message.startswith("No Redfish endpoints could be read") + assert data is None + assert len(result.events) >= 1 + assert any( + e.category == EventCategory.RUNTIME.value or "Redfish GET failed" in (e.description or "") + for e in result.events + ) -def test_redfish_endpoint_plugin_plugin_config_only( - run_cli_command, redfish_plugin_config, tmp_path + +def test_redfish_endpoint_collector_mixed_success_fail( + redfish_endpoint_collector, redfish_conn_mock ): - assert redfish_plugin_config.exists() + def run_get_side_effect(path): + path_str = str(path) + if "Systems" in path_str: + return RedfishGetResult( + path=path_str if path_str.startswith("/") else "/" + path_str, + success=True, + data={"Id": "1"}, + status_code=200, + ) + return RedfishGetResult( + path=path_str if path_str.startswith("/") else 
"/" + path_str, + success=False, + error="Not Found", + status_code=404, + ) - log_path = str(tmp_path / "logs_redfish_noconn") - result = run_cli_command( - [ - "--log-path", - log_path, - "--plugin-configs=" + str(redfish_plugin_config), - "run-plugins", - "RedfishEndpointPlugin", - ], - check=False, + redfish_conn_mock.run_get.side_effect = run_get_side_effect + result, data = redfish_endpoint_collector.collect_data( + args=RedfishEndpointCollectorArgs(uris=["/redfish/v1/Systems", "/redfish/v1/Bad"]) ) + assert result.status == ExecutionStatus.OK + assert data is not None + assert len(data.responses) == 1 + keys = list(data.responses.keys()) + assert any("Systems" in k for k in keys) + assert list(data.responses.values())[0].get("Id") == "1" - output = result.stdout + result.stderr - assert "RedfishEndpointPlugin" in output or "Redfish" in output +def test_normalize_path_empty_or_invalid(): + api = "redfish/v1" + assert ec._normalize_path("", api) == "" + assert ec._normalize_path(None, api) == "" # type: ignore[arg-type] + assert ec._normalize_path(" ", api) == "" -def test_redfish_endpoint_plugin_default_subcommand( - run_cli_command, redfish_plugin_config, redfish_connection_config, tmp_path -): - assert redfish_plugin_config.exists() - assert redfish_connection_config.exists() - - log_path = str(tmp_path / "logs_redfish_default") - result = run_cli_command( - [ - "--log-path", - log_path, - "--connection-config", - str(redfish_connection_config), - "--plugin-configs=" + str(redfish_plugin_config), - "RedfishEndpointPlugin", - ], - check=False, - ) - - output = result.stdout + result.stderr - assert "RedfishEndpointPlugin" in output or "Redfish" in output + +def test_normalize_path_relative_path(): + api = "redfish/v1" + assert ec._normalize_path("/redfish/v1", api) == "/redfish/v1" + assert ec._normalize_path("/redfish/v1/", api) == "/redfish/v1" + assert ec._normalize_path("redfish/v1/Systems", api) == "/redfish/v1/Systems" + assert ec._normalize_path(" 
/redfish/v1/Chassis ", api) == "/redfish/v1/Chassis" + + +def test_normalize_path_full_url(): + api = "redfish/v1" + assert ec._normalize_path("https://host/redfish/v1/Systems", api) == "/redfish/v1/Systems" + assert ec._normalize_path("http://bmc/redfish/v1", api) == "/redfish/v1" + + +def test_normalize_path_outside_api_root(): + api = "redfish/v1" + assert ec._normalize_path("/other/root", api) == "" + assert ec._normalize_path("https://host/other/path", api) == "" + + +def test_extract_odata_ids_empty(): + assert ec._extract_odata_ids({}) == [] + assert ec._extract_odata_ids([]) == [] + assert ec._extract_odata_ids("x") == [] + + +def test_extract_odata_ids_single(): + assert ec._extract_odata_ids({"@odata.id": "/redfish/v1"}) == ["/redfish/v1"] + assert ec._extract_odata_ids({"@odata.id": "https://host/redfish/v1"}) == [ + "https://host/redfish/v1" + ] + + +def test_extract_odata_ids_members(): + body = { + "Members": [ + {"@odata.id": "/redfish/v1/Systems/1"}, + {"@odata.id": "/redfish/v1/Systems/2"}, + ] + } + assert set(ec._extract_odata_ids(body)) == {"/redfish/v1/Systems/1", "/redfish/v1/Systems/2"} + + +def test_extract_odata_ids_nested_and_members(): + body = { + "@odata.id": "/redfish/v1", + "Systems": {"@odata.id": "/redfish/v1/Systems", "Members": []}, + "Chassis": { + "Members": [{"@odata.id": "/redfish/v1/Chassis/1"}], + }, + } + ids = ec._extract_odata_ids(body) + assert "/redfish/v1" in ids + assert "/redfish/v1/Systems" in ids + assert "/redfish/v1/Chassis/1" in ids + + +def test_uris_from_args_none(): + assert ec._uris_from_args(None) == [] + + +def test_uris_from_args_empty(): + assert ec._uris_from_args(RedfishEndpointCollectorArgs(uris=[])) == [] + + +def test_uris_from_args_with_uris(): + args = RedfishEndpointCollectorArgs(uris=["/redfish/v1", "/redfish/v1/Systems"]) + assert ec._uris_from_args(args) == ["/redfish/v1", "/redfish/v1/Systems"] + + +def test_fetch_one_calls_run_get(): + conn = MagicMock() + conn.run_get.return_value = 
RedfishGetResult( + path="/redfish/v1", success=True, data={}, status_code=200 + ) + out = ec._fetch_one(conn, "/redfish/v1") + conn.run_get.assert_called_once_with("/redfish/v1") + assert out.success is True + assert out.path == "/redfish/v1" + + +def test_discover_tree_single_root(): + conn = MagicMock() + conn.run_get.return_value = RedfishGetResult( + path="/redfish/v1", + success=True, + data={"@odata.id": "/redfish/v1", "Name": "Root"}, + status_code=200, + ) + paths, responses, results = ec._discover_tree( + conn, api_root="redfish/v1", max_depth=2, max_endpoints=0 + ) + assert paths == ["/redfish/v1"] + assert list(responses.keys()) == ["/redfish/v1"] + assert responses["/redfish/v1"]["Name"] == "Root" + assert len(results) == 1 + conn.run_get.assert_called_once_with("/redfish/v1") + + +def test_discover_tree_follows_links(): + conn = MagicMock() + root_data = { + "@odata.id": "/redfish/v1", + "Systems": {"@odata.id": "/redfish/v1/Systems"}, + } + systems_data = { + "@odata.id": "/redfish/v1/Systems", + "Members": [{"@odata.id": "/redfish/v1/Systems/1"}], + } + system1_data = {"@odata.id": "/redfish/v1/Systems/1", "Id": "1"} + + def run_get(path): + if path == "/redfish/v1": + return RedfishGetResult(path=path, success=True, data=root_data, status_code=200) + if path == "/redfish/v1/Systems": + return RedfishGetResult(path=path, success=True, data=systems_data, status_code=200) + if path == "/redfish/v1/Systems/1": + return RedfishGetResult(path=path, success=True, data=system1_data, status_code=200) + return RedfishGetResult(path=path, success=False, error="Not Found", status_code=404) + + conn.run_get.side_effect = run_get + paths, responses, results = ec._discover_tree( + conn, api_root="redfish/v1", max_depth=3, max_endpoints=0 + ) + assert "/redfish/v1" in paths + assert "/redfish/v1/Systems" in paths + assert "/redfish/v1/Systems/1" in paths + assert responses["/redfish/v1"]["@odata.id"] == "/redfish/v1" + assert 
responses["/redfish/v1/Systems"]["@odata.id"] == "/redfish/v1/Systems" + assert responses["/redfish/v1/Systems/1"]["Id"] == "1" + assert len(results) >= 3 + + +def test_discover_tree_respects_max_depth(): + conn = MagicMock() + root_data = {"@odata.id": "/redfish/v1", "Systems": {"@odata.id": "/redfish/v1/Systems"}} + systems_data = { + "@odata.id": "/redfish/v1/Systems", + "Members": [{"@odata.id": "/redfish/v1/Systems/1"}], + } + + def run_get(path): + if path == "/redfish/v1": + return RedfishGetResult(path=path, success=True, data=root_data, status_code=200) + if path == "/redfish/v1/Systems": + return RedfishGetResult(path=path, success=True, data=systems_data, status_code=200) + return RedfishGetResult(path=path, success=True, data={}, status_code=200) + + conn.run_get.side_effect = run_get + paths, responses, results = ec._discover_tree( + conn, api_root="redfish/v1", max_depth=1, max_endpoints=0 + ) + assert "/redfish/v1" in paths + assert "/redfish/v1/Systems" not in paths + assert len(responses) == 1 + + +def test_discover_tree_respects_max_endpoints(): + conn = MagicMock() + conn.run_get.return_value = RedfishGetResult( + path="/redfish/v1", + success=True, + data={"@odata.id": "/redfish/v1", "Systems": {"@odata.id": "/redfish/v1/Systems"}}, + status_code=200, + ) + paths, responses, results = ec._discover_tree( + conn, api_root="redfish/v1", max_depth=5, max_endpoints=1 + ) + assert len(paths) == 1 + assert len(responses) == 1 + conn.run_get.assert_called_once() + + +def test_collect_data_discover_tree_success(redfish_endpoint_collector, redfish_conn_mock): + redfish_conn_mock.api_root = "redfish/v1" + redfish_conn_mock.run_get.return_value = RedfishGetResult( + path="/redfish/v1", + success=True, + data={"@odata.id": "/redfish/v1", "Name": "Root"}, + status_code=200, + ) + result, data = redfish_endpoint_collector.collect_data( + args=RedfishEndpointCollectorArgs(discover_tree=True) + ) + assert result.status == ExecutionStatus.OK + assert 
result.message == "Collected 1 Redfish endpoint(s) from tree" + assert data is not None + assert "/redfish/v1" in data.responses + assert data.responses["/redfish/v1"]["Name"] == "Root" + + +def test_collect_data_discover_tree_no_responses(redfish_endpoint_collector, redfish_conn_mock): + redfish_conn_mock.api_root = "redfish/v1" + redfish_conn_mock.run_get.return_value = RedfishGetResult( + path="/redfish/v1", success=False, error="Connection refused", status_code=None + ) + result, data = redfish_endpoint_collector.collect_data( + args=RedfishEndpointCollectorArgs(discover_tree=True) + ) + assert result.status == ExecutionStatus.ERROR + assert "No Redfish endpoints discovered from tree" in result.message + assert data is None + + +def test_collect_data_concurrent_two_uris(redfish_endpoint_collector, redfish_conn_mock): + redfish_conn_mock.copy.return_value = redfish_conn_mock + call_count = 0 + + def run_get(path): + nonlocal call_count + call_count += 1 + return RedfishGetResult( + path=path if path.startswith("/") else "/" + path, + success=True, + data={"path": path}, + status_code=200, + ) + + redfish_conn_mock.run_get.side_effect = run_get + result, data = redfish_endpoint_collector.collect_data( + args=RedfishEndpointCollectorArgs( + uris=["/redfish/v1", "/redfish/v1/Systems"], max_workers=2 + ) + ) + assert result.status == ExecutionStatus.OK + assert data is not None + assert len(data.responses) == 2 + assert "/redfish/v1" in data.responses or "redfish/v1" in data.responses + assert any("Systems" in k for k in data.responses) + assert redfish_conn_mock.copy.called diff --git a/test/unit/cli/test_cli_no_console_stdout.py b/test/unit/cli/test_cli_no_console_stdout.py new file mode 100644 index 00000000..775bccdf --- /dev/null +++ b/test/unit/cli/test_cli_no_console_stdout.py @@ -0,0 +1,152 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### + +import io +import json +from contextlib import redirect_stdout +from unittest.mock import MagicMock, patch + +import pytest + +from nodescraper.cli.cli import main + + +def _assert_main_leaves_stdout_empty(argv: list[str]) -> None: + out = io.StringIO() + with redirect_stdout(out): + with pytest.raises(SystemExit) as exc: + main(argv) + assert out.getvalue() == "", f"Unexpected stdout: {out.getvalue()!r}" + code = exc.value.code + if code is None: + code = 0 + assert code in (0, 1), f"Unexpected exit code: {exc.value.code!r}" + + +@pytest.fixture +def no_console_base(tmp_path): + log_base = tmp_path / "logs" + log_base.mkdir(parents=True, exist_ok=True) + return ["--log-path", str(log_base), "--no-console-log", "--log-level", "ERROR"] + + +def test_describe_no_stdout(no_console_base): + _assert_main_leaves_stdout_empty( + no_console_base + ["describe", "plugin", "BiosPlugin"], + ) + + +def test_summary_no_stdout(no_console_base, tmp_path): + search = tmp_path / "search_here" + search.mkdir() + _assert_main_leaves_stdout_empty( + no_console_base + ["summary", "--search-path", str(search)], + ) + + +def test_gen_plugin_config_no_stdout(no_console_base, tmp_path): + out_dir = tmp_path / "cfg_out" + out_dir.mkdir() + _assert_main_leaves_stdout_empty( + no_console_base + + [ + "gen-plugin-config", + "--plugins", + "BiosPlugin", + "--output-path", + str(out_dir), + "--config-name", + "out_config.json", + ], + ) + + +def test_compare_runs_no_stdout(no_console_base, tmp_path): + d1 = tmp_path / "run_a" + d2 = tmp_path / "run_b" + d1.mkdir() + d2.mkdir() + _assert_main_leaves_stdout_empty( + no_console_base + ["compare-runs", str(d1), str(d2)], + ) + + +def test_run_plugins_empty_config_no_stdout(no_console_base, tmp_path): + cfg = tmp_path / "empty_plugins.json" + cfg.write_text( + json.dumps( + { + "name": "empty", + "desc": "", + "plugins": {}, + "global_args": {}, + 
"result_collators": {}, + } + ), + encoding="utf-8", + ) + _assert_main_leaves_stdout_empty( + no_console_base + ["run-plugins", "--plugin-configs", str(cfg)], + ) + + +@patch("nodescraper.cli.cli.get_oem_diagnostic_allowable_values", return_value=["DiagTypeA"]) +@patch("nodescraper.cli.cli.RedfishConnection") +def test_show_redfish_oem_allowable_no_stdout( + mock_conn_cls, + _mock_get_allowable, + no_console_base, + tmp_path, +): + conn_path = tmp_path / "conn.json" + conn_path.write_text( + json.dumps( + { + "RedfishConnectionManager": { + "host": "127.0.0.1", + "username": "u", + "password": "p", + "verify_ssl": False, + } + } + ), + encoding="utf-8", + ) + mock_inst = MagicMock() + mock_conn_cls.return_value = mock_inst + + _assert_main_leaves_stdout_empty( + no_console_base + + [ + "--connection-config", + str(conn_path), + "show-redfish-oem-allowable", + "--log-service-path", + "redfish/v1/Systems/1/LogServices/Logs", + ], + ) + mock_inst._ensure_session.assert_called_once() + mock_inst.close.assert_called_once() diff --git a/test/unit/framework/test_cli.py b/test/unit/framework/test_cli.py index 014befa8..8df56e95 100644 --- a/test/unit/framework/test_cli.py +++ b/test/unit/framework/test_cli.py @@ -148,6 +148,25 @@ def test_system_info_builder(): [], ), ), + ( + [ + "run-plugins", + "RegexSearchPlugin", + "--error-regex", + '{"regex":"a","message":"b","event_category":"UNKNOWN"}', + ], + ["RegexSearchPlugin"], + ( + ["run-plugins"], + { + "RegexSearchPlugin": [ + "--error-regex", + '{"regex":"a","message":"b","event_category":"UNKNOWN"}', + ], + }, + [], + ), + ), ], ) def test_process_args(raw_arg_input, plugin_names, exp_output): diff --git a/test/unit/plugin/test_amdsmi_analyzer.py b/test/unit/plugin/test_amdsmi_analyzer.py index 6bc40330..f3966c97 100644 --- a/test/unit/plugin/test_amdsmi_analyzer.py +++ b/test/unit/plugin/test_amdsmi_analyzer.py @@ -461,8 +461,8 @@ def test_check_static_data_mismatch(mock_analyzer): assert len(analyzer.result.events) 
def test_check_firmware_versions_multiple_fw_ids_success(mock_analyzer):
    """check_firmware_versions passes when every expected fw_id matches on the GPU."""
    analyzer = mock_analyzer
    firmware_data = [
        Fw(
            gpu=0,
            fw_list=[
                FwListItem(fw_id="PLDM_BUNDLE", fw_version="1.2.3"),
                FwListItem(fw_id="OTHER_FW", fw_version="9.0"),
            ],
        ),
    ]
    expected_versions = {"PLDM_BUNDLE": "1.2.3", "OTHER_FW": "9.0"}
    analyzer.check_firmware_versions(firmware_data, expected_versions)
    assert not analyzer.result.events


def test_check_firmware_versions_one_id_mismatch(mock_analyzer):
    """check_firmware_versions logs an ERROR event when any fw_id version differs."""
    analyzer = mock_analyzer
    firmware_data = [
        Fw(
            gpu=0,
            fw_list=[
                FwListItem(fw_id="PLDM_BUNDLE", fw_version="1.2.3"),
                FwListItem(fw_id="OTHER_FW", fw_version="8.0"),  # expected 9.0 below
            ],
        ),
    ]
    expected_versions = {"PLDM_BUNDLE": "1.2.3", "OTHER_FW": "9.0"}
    analyzer.check_firmware_versions(firmware_data, expected_versions)
    assert len(analyzer.result.events) == 1
    assert analyzer.result.events[0].priority == EventPriority.ERROR
def _iface(name, stats, raw="dummy"):
    """Build an EthtoolInfo for interface *name* with the given statistics."""
    return EthtoolInfo(interface=name, raw_output=raw, statistics=stats)


@pytest.fixture
def network_analyzer(system_info):
    """NetworkAnalyzer instance under test."""
    return NetworkAnalyzer(system_info)


@pytest.fixture
def clean_ethtool_info():
    """EthtoolInfo whose PFC error counters are all zero.

    Also carries a couple of non-zero, non-error statistics to exercise the
    analyzer's filtering of unrelated counters.
    """
    stats = {"tx_pfc_frames": "0"}
    stats.update({f"tx_pfc_ena_frames_pri{i}": "0" for i in range(8)})
    stats.update({f"pfc_pri{i}_tx_transitions": "0" for i in range(8)})
    stats["some_other_stat"] = "100"  # Should be ignored
    stats["rx_bytes"] = "1234567"  # Should be ignored
    return EthtoolInfo(interface="eth0", raw_output="dummy output", statistics=stats)


@pytest.fixture
def clean_network_model(clean_ethtool_info):
    """NetworkDataModel with a single error-free interface."""
    return NetworkDataModel(ethtool_info={"eth0": clean_ethtool_info})


def test_no_errors_detected(network_analyzer, clean_network_model):
    """Nominal data produces OK status and no events."""
    result = network_analyzer.analyze_data(clean_network_model)
    assert result.status == ExecutionStatus.OK
    assert "No network errors detected" in result.message
    assert not result.events


def test_single_error_detected(network_analyzer, clean_ethtool_info):
    """A single non-zero error counter yields one ERROR event."""
    clean_ethtool_info.statistics["tx_pfc_frames"] = "5"
    result = network_analyzer.analyze_data(
        NetworkDataModel(ethtool_info={"eth0": clean_ethtool_info})
    )
    assert result.status == ExecutionStatus.ERROR
    assert "Network errors detected in statistics" in result.message
    assert len(result.events) == 1
    event = result.events[0]
    assert event.description == "Network error detected on eth0: [tx_pfc_frames]"
    assert event.priority == EventPriority.ERROR
    assert event.data["errors"] == {"tx_pfc_frames": 5}
    assert event.data["interface"] == "eth0"


def test_multiple_errors_same_interface(network_analyzer, clean_ethtool_info):
    """Several error counters on one interface fold into a single event."""
    for stat, value in (
        ("tx_pfc_frames", "10"),
        ("tx_pfc_ena_frames_pri0", "3"),
        ("pfc_pri2_tx_transitions", "7"),
    ):
        clean_ethtool_info.statistics[stat] = value
    result = network_analyzer.analyze_data(
        NetworkDataModel(ethtool_info={"eth0": clean_ethtool_info})
    )
    assert result.status == ExecutionStatus.ERROR
    assert "Network errors detected in statistics" in result.message
    assert len(result.events) == 1  # one event per interface
    event = result.events[0]
    assert event.priority == EventPriority.ERROR
    # All three counters must be reported, with integer values.
    assert event.data["errors"] == {
        "tx_pfc_frames": 10,
        "tx_pfc_ena_frames_pri0": 3,
        "pfc_pri2_tx_transitions": 7,
    }


def test_multiple_interfaces_with_errors(network_analyzer):
    """Each faulty interface produces its own event."""
    model = NetworkDataModel(
        ethtool_info={
            "eth0": _iface("eth0", {"tx_pfc_frames": "15", "tx_pfc_ena_frames_pri1": "0"}),
            "eth1": _iface("eth1", {"pfc_pri3_tx_transitions": "8"}),
            "eth2": _iface("eth2", {"tx_pfc_ena_frames_pri7": "100"}),
        }
    )
    result = network_analyzer.analyze_data(model)
    assert result.status == ExecutionStatus.ERROR
    assert len(result.events) == 3
    assert {event.data["interface"] for event in result.events} == {"eth0", "eth1", "eth2"}


def test_empty_ethtool_info(network_analyzer):
    """No interfaces at all downgrades the result to WARNING with a message."""
    result = network_analyzer.analyze_data(NetworkDataModel(ethtool_info={}))
    assert result.status == ExecutionStatus.WARNING
    assert result.message == "No network devices found"


def test_regex_patterns_priority_numbers(network_analyzer):
    """Priority-indexed counters match for single- and double-digit priorities."""
    stats = {
        "tx_pfc_ena_frames_pri0": "1",
        "tx_pfc_ena_frames_pri3": "2",
        "tx_pfc_ena_frames_pri7": "3",
        "tx_pfc_ena_frames_pri10": "4",  # double-digit priority
        "pfc_pri0_tx_transitions": "5",
        "pfc_pri5_tx_transitions": "6",
        "pfc_pri15_tx_transitions": "7",  # double-digit priority
    }
    result = network_analyzer.analyze_data(
        NetworkDataModel(ethtool_info={"eth0": _iface("eth0", stats)})
    )
    assert result.status == ExecutionStatus.ERROR
    assert len(result.events) == 1
    # All seven counters must be detected.
    assert len(result.events[0].data["errors"]) == 7


def test_non_numeric_values_ignored(network_analyzer):
    """Counters whose value is not numeric are skipped without crashing."""
    stats = {
        "tx_pfc_frames": "N/A",  # non-numeric
        "tx_pfc_ena_frames_pri0": "invalid",  # non-numeric
        "pfc_pri1_tx_transitions": "5",  # the only valid, non-zero error
    }
    result = network_analyzer.analyze_data(
        NetworkDataModel(ethtool_info={"eth0": _iface("eth0", stats)})
    )
    assert result.status == ExecutionStatus.ERROR
    assert len(result.events) == 1
    assert result.events[0].data["errors"] == {"pfc_pri1_tx_transitions": 5}


def test_zero_values_not_reported(network_analyzer):
    """Zero-valued error counters do not trigger events."""
    stats = {
        "tx_pfc_frames": "0",
        "tx_pfc_ena_frames_pri0": "0",
        "pfc_pri1_tx_transitions": "0",
    }
    result = network_analyzer.analyze_data(
        NetworkDataModel(ethtool_info={"eth0": _iface("eth0", stats)})
    )
    assert result.status == ExecutionStatus.OK
    assert not result.events


def test_non_matching_fields_ignored(network_analyzer):
    """High values in statistics that are not error counters are ignored."""
    stats = {
        "rx_bytes": "999999999",  # high value but not an error field
        "tx_bytes": "888888888",  # high value but not an error field
        "some_random_counter": "12345",  # not an error field
        "tx_pfc_frames": "5",  # the only genuine error counter
    }
    result = network_analyzer.analyze_data(
        NetworkDataModel(ethtool_info={"eth0": _iface("eth0", stats)})
    )
    assert result.status == ExecutionStatus.ERROR
    assert len(result.events) == 1
    assert set(result.events[0].data["errors"]) == {"tx_pfc_frames"}


def test_mixed_interfaces_with_and_without_errors(network_analyzer):
    """Only interfaces with non-zero error counters generate events."""
    model = NetworkDataModel(
        ethtool_info={
            "eth0": _iface("eth0", {"tx_pfc_frames": "10"}),
            "eth1": _iface("eth1", {"tx_pfc_frames": "0", "tx_pfc_ena_frames_pri0": "0"}),
            "eth2": _iface("eth2", {"pfc_pri5_tx_transitions": "20"}),
        }
    )
    result = network_analyzer.analyze_data(model)
    assert result.status == ExecutionStatus.ERROR
    # Only eth0 and eth2 report; clean eth1 must not generate an event.
    assert len(result.events) == 2
    assert {event.data["interface"] for event in result.events} == {"eth0", "eth2"}


def test_custom_error_regex_detected(network_analyzer):
    """A caller-supplied regex is applied in addition to the built-in patterns."""
    stats = {
        "custom_tx_drops": "9",  # matched only via the custom regex
        "tx_pfc_frames": "0",
    }
    args = NetworkAnalyzerArgs(
        error_regex=[
            {
                "regex": r"^custom_tx_drops$",
                "message": "Custom tx drops",
                "event_category": "NETWORK",
            }
        ]
    )

    result = network_analyzer.analyze_data(
        NetworkDataModel(ethtool_info={"eth0": _iface("eth0", stats)}), args=args
    )

    assert result.status == ExecutionStatus.ERROR
    assert len(result.events) == 1
    assert result.events[0].data["interface"] == "eth0"
    assert result.events[0].data["errors"] == {"custom_tx_drops": 9}
def test_redfish_endpoint_collector_no_uris(redfish_endpoint_collector):
    """collect_data with no args selects no collection mode and does not run."""
    result, data = redfish_endpoint_collector.collect_data()
    assert result.status == ExecutionStatus.NOT_RAN
    assert "No collection mode configured" in result.message
    assert data is None


def test_redfish_endpoint_collector_no_uris_with_args(redfish_endpoint_collector):
    """An explicit empty URI list also selects no collection mode."""
    result, data = redfish_endpoint_collector.collect_data(
        args=RedfishEndpointCollectorArgs(uris=[])
    )
    assert result.status == ExecutionStatus.NOT_RAN
    assert "No collection mode configured" in result.message
    assert data is None


def test_normalize_path_empty_or_invalid():
    """Blank or None inputs normalize to the empty string."""
    api = "redfish/v1"
    assert ec._normalize_path("", api) == ""
    assert ec._normalize_path(None, api) == ""  # type: ignore[arg-type]
    assert ec._normalize_path(" ", api) == ""


def test_normalize_path_relative_path():
    """Relative paths gain a leading slash; trailing slash/whitespace stripped."""
    api = "redfish/v1"
    assert ec._normalize_path("/redfish/v1", api) == "/redfish/v1"
    assert ec._normalize_path("/redfish/v1/", api) == "/redfish/v1"
    assert ec._normalize_path("redfish/v1/Systems", api) == "/redfish/v1/Systems"
    assert ec._normalize_path(" /redfish/v1/Chassis ", api) == "/redfish/v1/Chassis"


def test_normalize_path_full_url():
    """Absolute URLs are reduced to their path component."""
    api = "redfish/v1"
    assert ec._normalize_path("https://host/redfish/v1/Systems", api) == "/redfish/v1/Systems"
    assert ec._normalize_path("http://bmc/redfish/v1", api) == "/redfish/v1"


def test_normalize_path_outside_api_root():
    """Paths outside the API root are rejected (empty string)."""
    api = "redfish/v1"
    assert ec._normalize_path("/other/root", api) == ""
    assert ec._normalize_path("https://host/other/path", api) == ""


def test_extract_odata_ids_empty():
    """Empty or non-dict payloads yield no IDs."""
    assert ec._extract_odata_ids({}) == []
    assert ec._extract_odata_ids([]) == []
    assert ec._extract_odata_ids("x") == []


def test_extract_odata_ids_single():
    """A top-level @odata.id is returned as-is, relative or absolute."""
    assert ec._extract_odata_ids({"@odata.id": "/redfish/v1"}) == ["/redfish/v1"]
    assert ec._extract_odata_ids({"@odata.id": "https://host/redfish/v1"}) == [
        "https://host/redfish/v1"
    ]


def test_extract_odata_ids_members():
    """Collection Members contribute their @odata.id entries."""
    payload = {
        "Members": [
            {"@odata.id": "/redfish/v1/Systems/1"},
            {"@odata.id": "/redfish/v1/Systems/2"},
        ]
    }
    assert set(ec._extract_odata_ids(payload)) == {
        "/redfish/v1/Systems/1",
        "/redfish/v1/Systems/2",
    }


def test_extract_odata_ids_nested_and_members():
    """IDs are found at the top level, inside nested objects, and in Members."""
    payload = {
        "@odata.id": "/redfish/v1",
        "Systems": {"@odata.id": "/redfish/v1/Systems", "Members": []},
        "Chassis": {
            "Members": [{"@odata.id": "/redfish/v1/Chassis/1"}],
        },
    }
    found = ec._extract_odata_ids(payload)
    for expected in ("/redfish/v1", "/redfish/v1/Systems", "/redfish/v1/Chassis/1"):
        assert expected in found


def test_uris_from_args_none():
    """No args object means no URIs."""
    assert ec._uris_from_args(None) == []


def test_uris_from_args_empty():
    """An empty configured URI list stays empty."""
    assert ec._uris_from_args(RedfishEndpointCollectorArgs(uris=[])) == []


def test_uris_from_args_with_uris():
    """Configured URIs pass through unchanged, in order."""
    args = RedfishEndpointCollectorArgs(uris=["/redfish/v1", "/redfish/v1/Systems"])
    assert ec._uris_from_args(args) == ["/redfish/v1", "/redfish/v1/Systems"]


def test_fetch_one_calls_run_get():
    """_fetch_one issues exactly one GET for the given path and returns it."""
    conn = MagicMock()
    conn.run_get.return_value = RedfishGetResult(
        path="/redfish/v1", success=True, data={}, status_code=200
    )
    fetched = ec._fetch_one(conn, "/redfish/v1")
    conn.run_get.assert_called_once_with("/redfish/v1")
    assert fetched.success is True
    assert fetched.path == "/redfish/v1"


def test_discover_tree_single_root():
    """A root resource with no links yields exactly one discovered endpoint."""
    conn = MagicMock()
    conn.run_get.return_value = RedfishGetResult(
        path="/redfish/v1",
        success=True,
        data={"@odata.id": "/redfish/v1", "Name": "Root"},
        status_code=200,
    )
    paths, responses, results = ec._discover_tree(
        conn, api_root="redfish/v1", max_depth=2, max_endpoints=0
    )
    assert paths == ["/redfish/v1"]
    assert list(responses) == ["/redfish/v1"]
    assert responses["/redfish/v1"]["Name"] == "Root"
    assert len(results) == 1
    conn.run_get.assert_called_once_with("/redfish/v1")


def test_discover_tree_follows_links():
    """Discovery walks @odata.id links through collections down to leaves."""
    conn = MagicMock()
    bodies = {
        "/redfish/v1": {
            "@odata.id": "/redfish/v1",
            "Systems": {"@odata.id": "/redfish/v1/Systems"},
        },
        "/redfish/v1/Systems": {
            "@odata.id": "/redfish/v1/Systems",
            "Members": [{"@odata.id": "/redfish/v1/Systems/1"}],
        },
        "/redfish/v1/Systems/1": {"@odata.id": "/redfish/v1/Systems/1", "Id": "1"},
    }

    def fake_get(path):
        if path in bodies:
            return RedfishGetResult(path=path, success=True, data=bodies[path], status_code=200)
        return RedfishGetResult(path=path, success=False, error="Not Found", status_code=404)

    conn.run_get.side_effect = fake_get
    paths, responses, results = ec._discover_tree(
        conn, api_root="redfish/v1", max_depth=3, max_endpoints=0
    )
    for expected in bodies:
        assert expected in paths
    assert responses["/redfish/v1"]["@odata.id"] == "/redfish/v1"
    assert responses["/redfish/v1/Systems"]["@odata.id"] == "/redfish/v1/Systems"
    assert responses["/redfish/v1/Systems/1"]["Id"] == "1"
    assert len(results) >= 3


def test_discover_tree_respects_max_depth():
    """Links deeper than max_depth are never fetched."""
    conn = MagicMock()
    root_data = {"@odata.id": "/redfish/v1", "Systems": {"@odata.id": "/redfish/v1/Systems"}}
    systems_data = {
        "@odata.id": "/redfish/v1/Systems",
        "Members": [{"@odata.id": "/redfish/v1/Systems/1"}],
    }

    def fake_get(path):
        if path == "/redfish/v1":
            return RedfishGetResult(path=path, success=True, data=root_data, status_code=200)
        if path == "/redfish/v1/Systems":
            return RedfishGetResult(path=path, success=True, data=systems_data, status_code=200)
        return RedfishGetResult(path=path, success=True, data={}, status_code=200)

    conn.run_get.side_effect = fake_get
    paths, responses, _results = ec._discover_tree(
        conn, api_root="redfish/v1", max_depth=1, max_endpoints=0
    )
    assert "/redfish/v1" in paths
    assert "/redfish/v1/Systems" not in paths
    assert len(responses) == 1


def test_discover_tree_respects_max_endpoints():
    """Discovery stops issuing GETs once max_endpoints is reached."""
    conn = MagicMock()
    conn.run_get.return_value = RedfishGetResult(
        path="/redfish/v1",
        success=True,
        data={"@odata.id": "/redfish/v1", "Systems": {"@odata.id": "/redfish/v1/Systems"}},
        status_code=200,
    )
    paths, responses, _results = ec._discover_tree(
        conn, api_root="redfish/v1", max_depth=5, max_endpoints=1
    )
    assert len(paths) == 1
    assert len(responses) == 1
    conn.run_get.assert_called_once()


def test_collect_data_discover_tree_success(redfish_endpoint_collector, redfish_conn_mock):
    """discover_tree mode collects the root endpoint and reports OK."""
    redfish_conn_mock.api_root = "redfish/v1"
    redfish_conn_mock.run_get.return_value = RedfishGetResult(
        path="/redfish/v1",
        success=True,
        data={"@odata.id": "/redfish/v1", "Name": "Root"},
        status_code=200,
    )
    result, data = redfish_endpoint_collector.collect_data(
        args=RedfishEndpointCollectorArgs(discover_tree=True)
    )
    assert result.status == ExecutionStatus.OK
    assert result.message == "Collected 1 Redfish endpoint(s) from tree"
    assert data is not None
    assert "/redfish/v1" in data.responses
    assert data.responses["/redfish/v1"]["Name"] == "Root"


def test_collect_data_discover_tree_no_responses(redfish_endpoint_collector, redfish_conn_mock):
    """A failed root fetch yields an ERROR result and no data model."""
    redfish_conn_mock.api_root = "redfish/v1"
    redfish_conn_mock.run_get.return_value = RedfishGetResult(
        path="/redfish/v1", success=False, error="Connection refused", status_code=None
    )
    result, data = redfish_endpoint_collector.collect_data(
        args=RedfishEndpointCollectorArgs(discover_tree=True)
    )
    assert result.status == ExecutionStatus.ERROR
    assert result.message.startswith("No Redfish endpoints discovered from tree")
    assert data is None


def test_collect_data_concurrent_two_uris(redfish_endpoint_collector, redfish_conn_mock):
    """Concurrent collection fetches each configured URI exactly once.

    Fix over the original: the ``call_count`` accumulator was incremented in
    the side effect but never asserted on, so it verified nothing; it is now
    checked to confirm one GET per URI with no retries or duplicates.
    """
    redfish_conn_mock.copy.return_value = redfish_conn_mock
    call_count = 0

    def fake_get(path):
        nonlocal call_count
        call_count += 1
        normalized = path if path.startswith("/") else "/" + path
        return RedfishGetResult(
            path=normalized,
            success=True,
            data={"path": path},
            status_code=200,
        )

    redfish_conn_mock.run_get.side_effect = fake_get
    result, data = redfish_endpoint_collector.collect_data(
        args=RedfishEndpointCollectorArgs(
            uris=["/redfish/v1", "/redfish/v1/Systems"], max_workers=2
        )
    )
    assert result.status == ExecutionStatus.OK
    assert data is not None
    assert len(data.responses) == 2
    assert call_count == 2  # exactly one GET per configured URI
    assert "/redfish/v1" in data.responses or "redfish/v1" in data.responses
    assert any("Systems" in key for key in data.responses)
    assert redfish_conn_mock.copy.called
# Message emitted by the analyzer when it is invoked without analysis args.
EXPECTED_MISSING_ANALYSIS_MSG = "Analysis args need to be provided for the analyzer to run"


def test_regex_search_data_from_file():
    """import_model on a single file records its content, name, and parent dir."""
    with tempfile.NamedTemporaryFile(mode="w", suffix=".log", delete=False, encoding="utf-8") as f:
        f.write("alpha\nbeta ERROR gamma\n")
        path = f.name
    try:
        data = RegexSearchData.import_model(path)
        assert "ERROR" in data.content
        assert os.path.basename(path) in data.files
        assert data.data_root == os.path.dirname(path)
    finally:
        os.unlink(path)


def test_regex_search_data_from_directory():
    """import_model on a directory gathers every file with per-file headers."""
    with tempfile.TemporaryDirectory() as tmp:
        with open(f"{tmp}/a.txt", "w", encoding="utf-8") as f:
            f.write("one")
        with open(f"{tmp}/b.txt", "w", encoding="utf-8") as f:
            f.write("two")
        data = RegexSearchData.import_model(tmp)
        assert data.data_root == os.path.abspath(tmp)
        assert set(data.files.keys()) == {"a.txt", "b.txt"}
        assert data.files["a.txt"] == "one"
        assert data.files["b.txt"] == "two"
        assert "===== a.txt =====" in data.content
        assert "===== b.txt =====" in data.content


def test_regex_search_analyzer_match(system_info):
    """A matching error_regex produces a single ERROR event with its message."""
    data = RegexSearchData(content="line1\nFATAL: boom\nline3")
    analyzer = RegexSearchAnalyzer(system_info=system_info)
    args = RegexSearchAnalyzerArgs(
        error_regex=[{"regex": r"FATAL:.*", "message": "fatal seen"}],
    )
    result = analyzer.analyze_data(data, args)
    assert result.status == ExecutionStatus.ERROR
    assert "task detected errors" in result.message
    assert "fatal seen" in result.message
    assert len(result.events) == 1
    assert result.events[0].description == "fatal seen"


def test_regex_search_analyzer_missing_args(system_info):
    """None and empty analysis args all yield NOT_RAN with the standard message."""
    data = RegexSearchData(content="x")
    analyzer = RegexSearchAnalyzer(system_info=system_info)
    for args in (
        None,
        RegexSearchAnalyzerArgs(error_regex=None),
        RegexSearchAnalyzerArgs(error_regex=[]),
    ):
        result = analyzer.analyze_data(data, args)
        assert result.status == ExecutionStatus.NOT_RAN
        assert result.message == EXPECTED_MISSING_ANALYSIS_MSG


def test_regex_search_plugin_missing_error_regex_not_ran_and_warning(
    system_info, logger, caplog, tmp_path
):
    """Running analysis without args reports NOT_RAN and logs a warning."""
    log_file = tmp_path / "sample.log"
    log_file.write_text("line\n", encoding="utf-8")
    plugin = RegexSearchPlugin(system_info=system_info, logger=logger)
    with caplog.at_level(logging.WARNING, logger=logger.name):
        out = plugin.run(
            collection=False,
            analysis=True,
            data=str(log_file),
            analysis_args=None,
        )
    analysis_result = out.result_data.analysis_result
    assert analysis_result.status == ExecutionStatus.NOT_RAN
    assert analysis_result.message == EXPECTED_MISSING_ANALYSIS_MSG
    assert any(
        "analysis args need to be provided" in record.getMessage().lower()
        for record in caplog.records
    )


def test_regex_search_plugin_empty_analysis_args_dict_not_ran(system_info, logger, tmp_path):
    """An empty analysis_args dict behaves the same as no args at all."""
    log_file = tmp_path / "sample.log"
    log_file.write_text("line\n", encoding="utf-8")
    plugin = RegexSearchPlugin(system_info=system_info, logger=logger)
    out = plugin.run(
        collection=False,
        analysis=True,
        data=str(log_file),
        analysis_args={},
    )
    assert out.result_data.analysis_result.status == ExecutionStatus.NOT_RAN
    assert out.result_data.analysis_result.message == EXPECTED_MISSING_ANALYSIS_MSG


def test_regex_search_plugin_no_data_warns_and_data_message(system_info, logger, caplog):
    """With no data at all, the plugin reports the missing-data message."""
    plugin = RegexSearchPlugin(system_info=system_info, logger=logger)
    with caplog.at_level(logging.WARNING, logger=logger.name):
        out = plugin.run(
            collection=False,
            analysis=True,
            data=None,
            analysis_args=None,
        )
    analysis_result = out.result_data.analysis_result
    assert analysis_result.status == ExecutionStatus.NOT_RAN
    assert "No data available to analyze for RegexSearchPlugin" in analysis_result.message
    assert any(
        "analysis args need to be provided" in record.getMessage().lower()
        for record in caplog.records
    )


def test_regex_search_plugin_analyzer_only(system_info, logger):
    """Analysis-only run surfaces the match message, event, and file path."""
    with tempfile.NamedTemporaryFile(mode="w", suffix=".log", delete=False, encoding="utf-8") as f:
        f.write("match_me_here\n")
        path = f.name
    try:
        plugin = RegexSearchPlugin(system_info=system_info, logger=logger)
        out = plugin.run(
            collection=False,
            analysis=True,
            data=path,
            analysis_args={
                "error_regex": [{"regex": r"match_me_here", "message": "found"}],
            },
        )
        assert out.status == ExecutionStatus.ERROR
        assert "Analysis error:" in out.message
        assert "found" in out.message
        analysis_result = out.result_data.analysis_result
        assert analysis_result.status == ExecutionStatus.ERROR
        assert len(analysis_result.events) == 1
        description = analysis_result.events[0].description
        assert "found" in description
        assert "[file:" in description
        # Compare with forward slashes so the check is portable to Windows.
        assert path.replace("\\", "/") in description.replace("\\", "/")
    finally:
        os.unlink(path)


def test_regex_search_multi_file_event_paths(system_info):
    """Only the file containing the match is named in the event description.

    Fix over the original: the fixture files were written via chained
    ``open(...).write(...)`` calls that never explicitly closed (or reliably
    flushed) the handles; context managers now guarantee the files are on
    disk before ``import_model`` reads the directory.
    """
    with tempfile.TemporaryDirectory() as tmp:
        with open(os.path.join(tmp, "clean.log"), "w", encoding="utf-8") as f:
            f.write("ok\n")
        with open(os.path.join(tmp, "bad.log"), "w", encoding="utf-8") as f:
            f.write("ERROR: boom\n")
        data = RegexSearchData.import_model(tmp)
        analyzer = RegexSearchAnalyzer(system_info=system_info)
        args = RegexSearchAnalyzerArgs(
            error_regex=[{"regex": r"ERROR[: ].*", "message": "err line"}],
        )
        result = analyzer.analyze_data(data, args)
        assert result.status == ExecutionStatus.ERROR
        assert len(result.events) == 1
        assert "err line" in result.events[0].description
        assert "[file:" in result.events[0].description
        assert "bad.log" in result.events[0].description