Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,11 @@ class DeviceEnumerationCollector(InBandDataCollector[DeviceEnumerationDataModel,

DATA_MODEL = DeviceEnumerationDataModel

CMD_GPU_COUNT_LINUX = "lspci -d {vendorid_ep}: | grep -i 'VGA\\|Display\\|3D' | wc -l"
CMD_GPU_COUNT_LINUX = (
"lspci -d {vendorid_ep}: | grep -iE "
"'VGA|Display|3D|Processing accelerators|Co-processor|Accelerator' | "
"grep -vi 'Virtual Function' | wc -l"
)
CMD_VF_COUNT_LINUX = "lspci -d {vendorid_ep}: | grep -i 'Virtual Function' | wc -l"
CMD_LSCPU_LINUX = "lscpu"
CMD_LSHW_LINUX = "lshw"
Expand Down
28 changes: 26 additions & 2 deletions nodescraper/plugins/inband/pcie/pcie_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@

T_CAP = TypeVar("T_CAP", bound=PcieCapStructure)

_AMD_PCIE_BRIDGE_DEVICE_IDS = frozenset({0x1500, 0x1501})
_PCI_BASE_CLASS_BRIDGE = 0x06


class PcieAnalyzerInputModel(BaseModel):
"""
Expand Down Expand Up @@ -870,6 +873,20 @@ def filter_pcie_data_by_device_id(
new_cfg_space_dict[bdf] = pcie_data
return new_cfg_space_dict

@staticmethod
def _is_amd_gpu_pcie_endpoint(cfg_space: PcieCfgSpace, vendorid_ep: int) -> bool:
    """Decide whether a config space should be counted as a GPU/accelerator endpoint.

    Args:
        cfg_space: Parsed PCIe configuration space of a single device.
        vendorid_ep: Vendor ID that an endpoint is expected to report.

    Returns:
        False when the vendor ID differs from ``vendorid_ep``, when the
        device ID is one of ``_AMD_PCIE_BRIDGE_DEVICE_IDS``, or when the
        class code equals ``_PCI_BASE_CLASS_BRIDGE``; True otherwise.
    """
    header = cfg_space.type_0_configuration

    # A device from another vendor can never be the endpoint we are counting.
    if header.vendor_id.val != vendorid_ep:
        return False

    # Known bridge device IDs share the vendor ID but are not GPUs.
    if header.device_id.val in _AMD_PCIE_BRIDGE_DEVICE_IDS:
        return False

    # NOTE(review): this assumes class_code.val carries only the base class
    # (not the full class/subclass/prog-if triple) -- confirm against the
    # PcieCfgSpace register definition.
    return header.class_code.val != _PCI_BASE_CLASS_BRIDGE

def check_gpu_count(
self,
pcie_data: PcieDataModel,
Expand All @@ -888,10 +905,15 @@ def check_gpu_count(
return

gpu_count_from_pcie = 0
bridge_count = 0
for cfg_space in pcie_data.pcie_cfg_space.values():
vendor_id = cfg_space.type_0_configuration.vendor_id.val
if vendor_id == self.system_info.vendorid_ep:
t0 = cfg_space.type_0_configuration
if t0.vendor_id.val != self.system_info.vendorid_ep:
continue
if self._is_amd_gpu_pcie_endpoint(cfg_space, self.system_info.vendorid_ep):
gpu_count_from_pcie += 1
else:
bridge_count += 1

if gpu_count_from_pcie != expected_gpu_count:
self._log_event(
Expand All @@ -900,6 +922,7 @@ def check_gpu_count(
priority=EventPriority.ERROR,
data={
"gpu_count_from_pcie": gpu_count_from_pcie,
"amd_pcie_bridge_count_excluded": bridge_count,
"expected_gpu_count": expected_gpu_count,
},
)
Expand All @@ -910,6 +933,7 @@ def check_gpu_count(
priority=EventPriority.INFO,
data={
"gpu_count": gpu_count_from_pcie,
"amd_pcie_bridge_count_excluded": bridge_count,
},
)

Expand Down
12 changes: 10 additions & 2 deletions test/unit/plugin/test_device_enumeration_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,11 @@ def test_collect_linux(system_info, device_enumeration_collector):
exit_code=0,
stdout="8",
stderr="",
command="lspci -d 1002: | grep -i 'VGA\\|Display\\|3D' | wc -l",
command=(
"lspci -d 1002: | grep -iE "
"'VGA|Display|3D|Processing accelerators|Co-processor|Accelerator' | "
"grep -vi 'Virtual Function' | wc -l"
),
),
MagicMock(
exit_code=0,
Expand Down Expand Up @@ -142,7 +146,11 @@ def test_collect_error(system_info, device_enumeration_collector):
exit_code=1,
stdout="some output",
stderr="command failed",
command="lspci -d 1002: | grep -i 'VGA\\|Display\\|3D' | wc -l",
command=(
"lspci -d 1002: | grep -iE "
"'VGA|Display|3D|Processing accelerators|Co-processor|Accelerator' | "
"grep -vi 'Virtual Function' | wc -l"
),
),
MagicMock(
exit_code=1,
Expand Down
Loading