From 9845423355fa69be551229a9bda0c8d9ae1633d1 Mon Sep 17 00:00:00 2001 From: Mahendra Paipuri Date: Fri, 5 Sep 2025 15:57:00 +0200 Subject: [PATCH] fix: Allow GPU fetching to fail for libvirt collector * When GPUs are configured for use in passthrough mode, there is no guarantee that `nvidia-smi` command will return GPUs. In that case we should not block exporter from starting. In any case, we cannot fetch metrics of passed through GPUs so it should not block exporter. * Simulate the case in unit test Signed-off-by: Mahendra Paipuri --- pkg/collector/libvirt.go | 9 ++++++--- pkg/collector/libvirt_test.go | 5 ++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/pkg/collector/libvirt.go b/pkg/collector/libvirt.go index 4e627f5a..9672cff3 100644 --- a/pkg/collector/libvirt.go +++ b/pkg/collector/libvirt.go @@ -235,10 +235,13 @@ func NewLibvirtCollector(logger *slog.Logger) (Collector, error) { err = gpuSMI.Discover() if err != nil { // If we failed to fetch GPUs that are from supported - // vendor, return with error + // vendor, DO NOT return with error. + // Seems like we can run into cases where hypervisors + // do not have GPU drivers installed when they use + // passthrough. In case we cannot get GPUs on the + // hypervisor so we should not block exporter from + // starting logger.Error("Error fetching GPU devices", "err", err) - - return nil, err } // Check if vGPU is activated on atleast one GPU diff --git a/pkg/collector/libvirt_test.go b/pkg/collector/libvirt_test.go index 4c056049..eeaeb818 100644 --- a/pkg/collector/libvirt_test.go +++ b/pkg/collector/libvirt_test.go @@ -30,7 +30,10 @@ func TestNewLibvirtCollector(t *testing.T) { "--collector.perf.hardware-events", "--collector.rdma.stats", "--collector.gpu.type", "nvidia", - "--collector.gpu.nvidia-smi-path", "testdata/nvidia-smi", + // This is to simulate GPU device detection but + // fail to find GPU devices. The collector should + // initialise correctly in that case. 
+ // "--collector.gpu.nvidia-smi-path", "testdata/nvidia-smi", "--collector.cgroups.force-version", "v2", }, )