From d103897c7bec8f340d3ca2b78177f414c81f8dcc Mon Sep 17 00:00:00 2001 From: Mahendra Paipuri Date: Fri, 23 Feb 2024 18:43:13 +0100 Subject: [PATCH 1/2] feat: Support RDMA stats in exporter * Update unit tests and e2e test fixtures Signed-off-by: Mahendra Paipuri --- .../output/e2e-test-cgroupsv1-output.txt | 12 ++ .../e2e-test-cgroupsv2-all-metrics-output.txt | 12 ++ ...e2e-test-cgroupsv2-amd-ipmitool-output.txt | 12 ++ .../e2e-test-cgroupsv2-nogpu-output.txt | 12 ++ .../e2e-test-cgroupsv2-procfs-output.txt | 12 ++ pkg/collector/fixtures/sys.ttar | 119 ++++++++++++++++++ pkg/collector/slurm.go | 63 +++++++++- pkg/collector/slurm_test.go | 26 ++-- 8 files changed, 257 insertions(+), 11 deletions(-) diff --git a/pkg/collector/fixtures/output/e2e-test-cgroupsv1-output.txt b/pkg/collector/fixtures/output/e2e-test-cgroupsv1-output.txt index 8289e4cb..a6a4af0b 100644 --- a/pkg/collector/fixtures/output/e2e-test-cgroupsv1-output.txt +++ b/pkg/collector/fixtures/output/e2e-test-cgroupsv1-output.txt @@ -93,6 +93,18 @@ ceems_slurm_job_memory_total_bytes{hostname="",manager="slurm",project="testacc3 ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.0194048e+07 ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.0194048e+07 ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.0194048e+07 +# HELP ceems_slurm_job_rdma_hca_handles Current number of RDMA HCA handles +# TYPE ceems_slurm_job_rdma_hca_handles gauge +ceems_slurm_job_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 479 +ceems_slurm_job_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 289 +ceems_slurm_job_rdma_hca_handles{device="hfi1_1",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1479 +ceems_slurm_job_rdma_hca_handles{device="hfi1_2",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2479 +# HELP ceems_slurm_job_rdma_hca_objects Current number of RDMA HCA objects +# TYPE ceems_slurm_job_rdma_hca_objects gauge +ceems_slurm_job_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 479 +ceems_slurm_job_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 289 +ceems_slurm_job_rdma_hca_objects{device="hfi1_1",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 1479 +ceems_slurm_job_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 2479 # HELP ceems_slurm_jobs Total number of jobs # TYPE ceems_slurm_jobs gauge ceems_slurm_jobs{hostname="",manager="slurm"} 3 diff --git a/pkg/collector/fixtures/output/e2e-test-cgroupsv2-all-metrics-output.txt b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-all-metrics-output.txt index 9440ca9c..0832761d 100644 --- a/pkg/collector/fixtures/output/e2e-test-cgroupsv2-all-metrics-output.txt +++ 
b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-all-metrics-output.txt @@ -118,6 +118,18 @@ ceems_slurm_job_memsw_total_bytes{hostname="",manager="slurm",project="testacc3" ceems_slurm_job_memsw_used_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 0 ceems_slurm_job_memsw_used_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 0 ceems_slurm_job_memsw_used_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 0 +# HELP ceems_slurm_job_rdma_hca_handles Current number of RDMA HCA handles +# TYPE ceems_slurm_job_rdma_hca_handles gauge +ceems_slurm_job_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 479 +ceems_slurm_job_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 289 +ceems_slurm_job_rdma_hca_handles{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1479 +ceems_slurm_job_rdma_hca_handles{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2479 +# HELP ceems_slurm_job_rdma_hca_objects Current number of RDMA HCA objects +# TYPE ceems_slurm_job_rdma_hca_objects gauge +ceems_slurm_job_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 479 +ceems_slurm_job_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 289 +ceems_slurm_job_rdma_hca_objects{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1479 +ceems_slurm_job_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2479 # HELP ceems_slurm_jobs Total number of jobs # TYPE ceems_slurm_jobs gauge ceems_slurm_jobs{hostname="",manager="slurm"} 3 diff --git a/pkg/collector/fixtures/output/e2e-test-cgroupsv2-amd-ipmitool-output.txt b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-amd-ipmitool-output.txt index af665f25..23738fb0 100644 --- a/pkg/collector/fixtures/output/e2e-test-cgroupsv2-amd-ipmitool-output.txt +++ b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-amd-ipmitool-output.txt @@ -93,6 +93,18 @@ ceems_slurm_job_memory_total_bytes{hostname="",manager="slurm",project="testacc3 ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.111491072e+09 ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.111491072e+09 ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.111491072e+09 +# HELP ceems_slurm_job_rdma_hca_handles Current number of RDMA HCA handles +# TYPE ceems_slurm_job_rdma_hca_handles gauge +ceems_slurm_job_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 479 
+ceems_slurm_job_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 289 +ceems_slurm_job_rdma_hca_handles{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1479 +ceems_slurm_job_rdma_hca_handles{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2479 +# HELP ceems_slurm_job_rdma_hca_objects Current number of RDMA HCA objects +# TYPE ceems_slurm_job_rdma_hca_objects gauge +ceems_slurm_job_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 479 +ceems_slurm_job_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 289 +ceems_slurm_job_rdma_hca_objects{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1479 +ceems_slurm_job_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2479 # HELP ceems_slurm_jobs Total number of jobs # TYPE ceems_slurm_jobs gauge ceems_slurm_jobs{hostname="",manager="slurm"} 3 diff --git a/pkg/collector/fixtures/output/e2e-test-cgroupsv2-nogpu-output.txt b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-nogpu-output.txt index e6e4a1cf..1160b643 100644 --- a/pkg/collector/fixtures/output/e2e-test-cgroupsv2-nogpu-output.txt +++ b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-nogpu-output.txt @@ -87,6 +87,18 @@ ceems_slurm_job_memory_total_bytes{hostname="",manager="slurm",project="testacc3 ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.111491072e+09 ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.111491072e+09 ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.111491072e+09 +# HELP ceems_slurm_job_rdma_hca_handles Current number of RDMA HCA handles +# TYPE ceems_slurm_job_rdma_hca_handles gauge +ceems_slurm_job_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 479 +ceems_slurm_job_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 289 +ceems_slurm_job_rdma_hca_handles{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1479 +ceems_slurm_job_rdma_hca_handles{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2479 +# HELP ceems_slurm_job_rdma_hca_objects Current number of RDMA HCA objects +# TYPE ceems_slurm_job_rdma_hca_objects gauge +ceems_slurm_job_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 479 +ceems_slurm_job_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 289 
+ceems_slurm_job_rdma_hca_objects{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1479 +ceems_slurm_job_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2479 # HELP ceems_slurm_jobs Total number of jobs # TYPE ceems_slurm_jobs gauge ceems_slurm_jobs{hostname="",manager="slurm"} 3 diff --git a/pkg/collector/fixtures/output/e2e-test-cgroupsv2-procfs-output.txt b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-procfs-output.txt index cd03a0cb..a104f508 100644 --- a/pkg/collector/fixtures/output/e2e-test-cgroupsv2-procfs-output.txt +++ b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-procfs-output.txt @@ -93,6 +93,18 @@ ceems_slurm_job_memory_total_bytes{hostname="",manager="slurm",project="testacc3 ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5"} 4.111491072e+09 ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 4.111491072e+09 ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 4.111491072e+09 +# HELP ceems_slurm_job_rdma_hca_handles Current number of RDMA HCA handles +# TYPE ceems_slurm_job_rdma_hca_handles gauge +ceems_slurm_job_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 479 +ceems_slurm_job_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 289 +ceems_slurm_job_rdma_hca_handles{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1479 +ceems_slurm_job_rdma_hca_handles{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2479 +# HELP ceems_slurm_job_rdma_hca_objects Current number of RDMA HCA objects +# TYPE ceems_slurm_job_rdma_hca_objects gauge +ceems_slurm_job_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 479 +ceems_slurm_job_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="77caf800-acd0-1fd2-7211-644e46814fc1"} 289 +ceems_slurm_job_rdma_hca_objects{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 1479 +ceems_slurm_job_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="018ce2fe-b3f9-632a-7507-0e01c2687de5"} 2479 # HELP ceems_slurm_jobs Total number of jobs # TYPE ceems_slurm_jobs gauge ceems_slurm_jobs{hostname="",manager="slurm"} 3 diff --git a/pkg/collector/fixtures/sys.ttar b/pkg/collector/fixtures/sys.ttar index cabdc2cb..323a6a24 100644 --- a/pkg/collector/fixtures/sys.ttar +++ b/pkg/collector/fixtures/sys.ttar @@ -2543,6 +2543,101 @@ Lines: 5 9870 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/fs/cgroup/rdma +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/fs/cgroup/rdma/slurm +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - +Directory: sys/fs/cgroup/rdma/slurm/uid_1000 +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/rdma/slurm/uid_1000/cgroup.clone_children +Lines: 0 +Mode: 664 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/rdma/slurm/uid_1000/cgroup.procs +Lines: 0 +Mode: 664 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/rdma/slurm/uid_1000/cgroup.sane_behavior +Lines: 0 +Mode: 664 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/fs/cgroup/rdma/slurm/uid_1000/job_1009248 +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/rdma/slurm/uid_1000/job_1009248/cgroup.clone_children +Lines: 0 +Mode: 664 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/rdma/slurm/uid_1000/job_1009248/cgroup.procs +Lines: 0 +Mode: 664 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/rdma/slurm/uid_1000/job_1009248/notify_on_release +Lines: 0 +Mode: 664 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/rdma/slurm/uid_1000/job_1009248/rdma.current +Lines: 3 +hfi1_0 hca_handle=479 hca_object=340 +hfi1_1 hca_handle=1479 hca_object=1340 +hfi1_2 hca_handle=2479 hca_object=2340EOF +Mode: 664 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/rdma/slurm/uid_1000/job_1009248/rdma.max +Lines: 3 +hfi1_0 hca_handle=max hca_object=max +hfi1_1 hca_handle=max hca_object=max +hfi1_2 hca_handle=max hca_object=maxEOF +Mode: 664 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/rdma/slurm/uid_1000/job_1009248/tasks +Lines: 0 +Mode: 664 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/fs/cgroup/rdma/slurm/uid_1000/job_1009249 +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/rdma/slurm/uid_1000/job_1009249/cgroup.clone_children +Lines: 0 +Mode: 664 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/rdma/slurm/uid_1000/job_1009249/cgroup.procs +Lines: 0 +Mode: 664 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/rdma/slurm/uid_1000/job_1009249/notify_on_release +Lines: 0 +Mode: 664 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/rdma/slurm/uid_1000/job_1009249/rdma.current +Lines: 1 +hfi1_0 hca_handle=289 hca_object=1000EOF +Mode: 664 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/rdma/slurm/uid_1000/job_1009249/rdma.max +Lines: 1 +hfi1_0 hca_handle=max hca_object=maxEOF +Mode: 664 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/rdma/slurm/uid_1000/job_1009249/tasks +Lines: 0 +Mode: 664 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/rdma/slurm/uid_1000/notify_on_release +Lines: 0 +Mode: 664 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/rdma/slurm/uid_1000/release_agent +Lines: 0 +Mode: 664 +# ttar - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/rdma/slurm/uid_1000/tasks +Lines: 0 +Mode: 664 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Directory: sys/fs/cgroup/system.slice Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -6303,6 +6398,20 @@ Lines: 1 max Mode: 640 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/system.slice/slurmstepd.scope/job_1009249/rdma.current +Lines: 3 +hfi1_0 hca_handle=479 hca_object=340 +hfi1_1 hca_handle=1479 hca_object=1340 +hfi1_2 hca_handle=2479 hca_object=2340EOF +Mode: 664 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/system.slice/slurmstepd.scope/job_1009249/rdma.max +Lines: 3 +hfi1_0 hca_handle=max hca_object=max +hfi1_1 hca_handle=max hca_object=max +hfi1_2 hca_handle=max hca_object=maxEOF +Mode: 664 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Directory: sys/fs/cgroup/system.slice/slurmstepd.scope/job_1009249/step_3 Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -9685,6 +9794,16 @@ Lines: 1 max Mode: 640 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/system.slice/slurmstepd.scope/job_1009250/rdma.current +Lines: 1 +hfi1_0 hca_handle=289 hca_object=1000EOF +Mode: 664 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/system.slice/slurmstepd.scope/job_1009250/rdma.max +Lines: 1 +hfi1_0 hca_handle=max hca_object=maxEOF +Mode: 664 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Directory: sys/fs/cgroup/system.slice/slurmstepd.scope/job_1009250/step_3 Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/pkg/collector/slurm.go b/pkg/collector/slurm.go index 6cecd835..3fd7c4ad 100644 --- a/pkg/collector/slurm.go +++ b/pkg/collector/slurm.go @@ -114,6 +114,8 @@ type CgroupMetric struct { memswTotal float64 memswFailCount float64 memoryPressure float64 + rdmaHCAHandles map[string]float64 + rdmaHCAObjects map[string]float64 jobuser string jobaccount string jobid string @@ -144,6 +146,8 @@ type slurmCollector struct { jobMemswTotal *prometheus.Desc jobMemswFailCount *prometheus.Desc jobMemoryPressure *prometheus.Desc + jobRDMAHCAHandles *prometheus.Desc + jobRDMAHCAObjects *prometheus.Desc jobGpuFlag *prometheus.Desc collectError *prometheus.Desc jobPropsCache sync.Map @@ -302,6 +306,18 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) { []string{"manager", "hostname", "user", "project", "uuid"}, nil, ), + jobRDMAHCAHandles: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "job_rdma_hca_handles"), + "Current number of RDMA HCA handles", + []string{"manager", "hostname", "user", "project", "uuid", "device"}, + nil, + ), + jobRDMAHCAObjects: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "job_rdma_hca_objects"), + "Current number of RDMA HCA objects", + []string{"manager", "hostname", "user", "project", "uuid", "device"}, + nil, + ), jobGpuFlag: prometheus.NewDesc( prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "job_gpu_index_flag"), "Indicates running job on GPU, 1=job running", @@ -332,6 +348,7 @@ func subsystem() ([]cgroup1.Subsystem, error) { s := []cgroup1.Subsystem{ cgroup1.NewCpuacct(*cgroupfsPath), 
cgroup1.NewMemory(*cgroupfsPath), + cgroup1.NewRdma(*cgroupfsPath), } return s, nil } @@ -352,6 +369,8 @@ func (c *slurmCollector) Update(ch chan<- prometheus.Metric) error { if m.err { ch <- prometheus.MustNewConstMetric(c.collectError, prometheus.GaugeValue, 1, m.name) } + + // CPU stats ch <- prometheus.MustNewConstMetric(c.jobCPUUser, prometheus.GaugeValue, m.cpuUser, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid) ch <- prometheus.MustNewConstMetric(c.jobCPUSystem, prometheus.GaugeValue, m.cpuSystem, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid) // ch <- prometheus.MustNewConstMetric(c.cpuTotal, prometheus.GaugeValue, m.cpuTotal, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid) @@ -364,11 +383,15 @@ func (c *slurmCollector) Update(ch chan<- prometheus.Metric) error { } } ch <- prometheus.MustNewConstMetric(c.jobCPUs, prometheus.GaugeValue, float64(cpus), c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid) + + // Memory stats ch <- prometheus.MustNewConstMetric(c.jobMemoryRSS, prometheus.GaugeValue, m.memoryRSS, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid) ch <- prometheus.MustNewConstMetric(c.jobMemoryCache, prometheus.GaugeValue, m.memoryCache, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid) ch <- prometheus.MustNewConstMetric(c.jobMemoryUsed, prometheus.GaugeValue, m.memoryUsed, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid) ch <- prometheus.MustNewConstMetric(c.jobMemoryTotal, prometheus.GaugeValue, m.memoryTotal, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid) ch <- prometheus.MustNewConstMetric(c.jobMemoryFailCount, prometheus.GaugeValue, m.memoryFailCount, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid) + + // Swap memory and PSI stats. Push them only if they are available if *collectSwapMemoryStats { ch <- prometheus.MustNewConstMetric(c.jobMemswUsed, prometheus.GaugeValue, m.memswUsed, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid) ch <- prometheus.MustNewConstMetric(c.jobMemswTotal, prometheus.GaugeValue, m.memswTotal, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid) @@ -378,6 +401,20 @@ func (c *slurmCollector) Update(ch chan<- prometheus.Metric) error { ch <- prometheus.MustNewConstMetric(c.jobCPUPressure, prometheus.GaugeValue, m.cpuPressure, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid) ch <- prometheus.MustNewConstMetric(c.jobMemoryPressure, prometheus.GaugeValue, m.memoryPressure, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid) } + + // RDMA stats + for device, handles := range m.rdmaHCAHandles { + if handles > 0 { + ch <- prometheus.MustNewConstMetric(c.jobRDMAHCAHandles, prometheus.GaugeValue, handles, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid, device) + } + } + for device, objects := range m.rdmaHCAObjects { + if objects > 0 { + ch <- prometheus.MustNewConstMetric(c.jobRDMAHCAObjects, prometheus.GaugeValue, objects, c.manager, c.hostname, m.jobuser, m.jobaccount, m.jobuuid, device) + } + } + + // GPU job mapping if len(c.gpuDevs) > 0 { for _, gpuOrdinal := range m.jobgpuordinals { var uuid string @@ -513,9 +550,11 @@ func (c *slurmCollector) getCPUs(name string) ([]string, error) { } else { cpusPath = fmt.Sprintf("%s/cpuset%s/cpuset.cpus", *cgroupfsPath, name) } + if !fileExists(cpusPath) { - return nil, nil + return nil, fmt.Errorf("cpuset file %s not found", cpusPath) } + cpusData, err := os.ReadFile(cpusPath) if err != nil { level.Error(c.logger).Log("msg", "Error reading cpuset", "cpuset", cpusPath, "err", err) @@
-523,7 +562,7 @@ func (c *slurmCollector) getCPUs(name string) ([]string, error) { } cpus, err := c.parseCPUSet(strings.TrimSuffix(string(cpusData), "\n")) if err != nil { - level.Error(c.logger).Log("msg", "Error parsing cpu set", "cpuset", cpusPath, "err", err) + level.Error(c.logger).Log("msg", "Error parsing cpuset", "cpuset", cpusPath, "err", err) return nil, err } return cpus, nil @@ -833,6 +872,16 @@ func (c *slurmCollector) getCgroupsV1Metrics(name string) (CgroupMetric, error) } } + // Get RDMA metrics if available + if stats.Rdma != nil { + metric.rdmaHCAHandles = make(map[string]float64) + metric.rdmaHCAObjects = make(map[string]float64) + for _, device := range stats.Rdma.Current { + metric.rdmaHCAHandles[device.Device] = float64(device.HcaHandles) + metric.rdmaHCAObjects[device.Device] = float64(device.HcaObjects) + } + } + // Get job Info c.getJobProperties(name, &metric, nil) return metric, nil @@ -906,6 +955,16 @@ func (c *slurmCollector) getCgroupsV2Metrics(name string) (CgroupMetric, error) metric.memoryFailCount = float64(stats.MemoryEvents.Oom) } + // Get RDMA stats + if stats.Rdma != nil { + metric.rdmaHCAHandles = make(map[string]float64) + metric.rdmaHCAObjects = make(map[string]float64) + for _, device := range stats.Rdma.Current { + metric.rdmaHCAHandles[device.Device] = float64(device.HcaHandles) + metric.rdmaHCAObjects[device.Device] = float64(device.HcaObjects) + } + } + // Get job Info cgroupProcPids, err := ctrl.Procs(true) if err != nil { diff --git a/pkg/collector/slurm_test.go b/pkg/collector/slurm_test.go index 3daa85cc..2a03c3de 100644 --- a/pkg/collector/slurm_test.go +++ b/pkg/collector/slurm_test.go @@ -44,7 +44,7 @@ func TestCgroupsV2SlurmJobMetrics(t *testing.T) { } metrics, err := c.getJobsMetrics() expectedSlurmMetrics = CgroupMetric{ - name: "/system.slice/slurmstepd.scope/job_1009248", + name: "/system.slice/slurmstepd.scope/job_1009249", cpuUser: 60375.292848, cpuSystem: 115.777502, cpuTotal: 60491.070351, @@ -59,18 +59,20 @@ func TestCgroupsV2SlurmJobMetrics(t *testing.T) { memswTotal: 123456, memswFailCount: 0, memoryPressure: 0, - jobuser: "testusr", - jobaccount: "testacc", - jobid: "1009248", - jobuuid: "0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5", - jobgpuordinals: []string{"2", "3"}, + rdmaHCAHandles: map[string]float64{"hfi1_0": 479, "hfi1_1": 1479, "hfi1_2": 2479}, + rdmaHCAObjects: map[string]float64{"hfi1_0": 340, "hfi1_1": 1340, "hfi1_2": 2340}, + jobuser: "testusr2", + jobaccount: "testacc2", + jobid: "1009249", + jobuuid: "018ce2fe-b3f9-632a-7507-0e01c2687de5", + jobgpuordinals: []string{"0"}, err: false, } if err != nil { t.Fatalf("Cannot fetch data from getJobsMetrics function: %v ", err) } - if !reflect.DeepEqual(metrics["1009248"], expectedSlurmMetrics) { - t.Fatalf("Expected metrics data is %+v: \nGot %+v", expectedSlurmMetrics, metrics["1009248"]) + if !reflect.DeepEqual(metrics["1009249"], expectedSlurmMetrics) { + t.Fatalf("Expected metrics data is %#v: \nGot %#v", expectedSlurmMetrics, metrics["1009249"]) } } @@ -109,6 +111,8 @@ func TestCgroupsV2SlurmJobMetricsWithProcFs(t *testing.T) { memswTotal: 123456, memswFailCount: 0, memoryPressure: 0, + rdmaHCAHandles: make(map[string]float64), + rdmaHCAObjects: make(map[string]float64), jobuser: "testusr", jobaccount: "testacc", jobid: "1009248", @@ -157,6 +161,8 @@ func TestCgroupsV2SlurmJobMetricsNoJobProps(t *testing.T) { memswTotal: 1.8446744073709552e+19, memswFailCount: 0, memoryPressure: 0, + rdmaHCAHandles: make(map[string]float64), + rdmaHCAObjects: make(map[string]float64), 
jobuser: "", jobaccount: "", jobid: "1009248", @@ -206,6 +212,8 @@ func TestCgroupsV1SlurmJobMetrics(t *testing.T) { memswTotal: 9.223372036854772e+18, memswFailCount: 0, memoryPressure: 0, + rdmaHCAHandles: map[string]float64{"hfi1_0": 479, "hfi1_1": 1479, "hfi1_2": 2479}, + rdmaHCAObjects: map[string]float64{"hfi1_0": 340, "hfi1_1": 1340, "hfi1_2": 2340}, jobuser: "testusr", jobaccount: "testacc", jobid: "1009248", @@ -217,6 +225,6 @@ func TestCgroupsV1SlurmJobMetrics(t *testing.T) { t.Fatalf("Cannot fetch data from getJobsMetrics function: %v ", err) } if !reflect.DeepEqual(metrics["1009248"], expectedSlurmMetrics) { - t.Fatalf("Expected metrics data is %+v: \nGot %+v", expectedSlurmMetrics, metrics["1009248"]) + t.Fatalf("Expected metrics data is %#v: \nGot %#v", expectedSlurmMetrics, metrics["1009248"]) } } From 664531b71de068bc1b32a2f5da67f74ea58c27b9 Mon Sep 17 00:00:00 2001 From: Mahendra Paipuri Date: Fri, 23 Feb 2024 18:53:29 +0100 Subject: [PATCH 2/2] fix: Correct scenario name in Makefile Signed-off-by: Mahendra Paipuri --- Makefile | 2 +- .../e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c0bc3e65..5216de95 100644 --- a/Makefile +++ b/Makefile @@ -141,7 +141,7 @@ ifeq ($(CGO_BUILD), 0) test-e2e-update: build pkg/collector/fixtures/sys/.unpacked pkg/collector/fixtures/proc/.unpacked @echo ">> updating end-to-end tests outputs" ./scripts/e2e-test.sh -s exporter-cgroups-v1 -u || true - ./scripts/e2e-test.sh -s exporter-cgroups-v2-nvidia-ipmitutil -u || true + ./scripts/e2e-test.sh -s exporter-cgroups-v2-nvidia-ipmiutil -u || true ./scripts/e2e-test.sh -s exporter-cgroups-v2-amd-ipmitool -u || true ./scripts/e2e-test.sh -s exporter-cgroups-v2-nogpu -u || true ./scripts/e2e-test.sh -s exporter-cgroups-v2-procfs -u || true diff --git a/pkg/collector/fixtures/output/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt index 92a199ed..0804de59 100644 --- a/pkg/collector/fixtures/output/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt +++ b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-nvidia-ipmiutil-output.txt @@ -93,6 +93,18 @@ ceems_slurm_job_memory_total_bytes{hostname="",manager="slurm",project="testacc3 ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc",user="testusr",uuid="1009248"} 4.111491072e+09 ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 4.111491072e+09 ceems_slurm_job_memory_used_bytes{hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 4.111491072e+09 +# HELP ceems_slurm_job_rdma_hca_handles Current number of RDMA HCA handles +# TYPE ceems_slurm_job_rdma_hca_handles gauge +ceems_slurm_job_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 479 +ceems_slurm_job_rdma_hca_handles{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 289 +ceems_slurm_job_rdma_hca_handles{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 1479 +ceems_slurm_job_rdma_hca_handles{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 2479 +# HELP ceems_slurm_job_rdma_hca_objects Current number of RDMA HCA objects +# TYPE ceems_slurm_job_rdma_hca_objects gauge 
+ceems_slurm_job_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 479 +ceems_slurm_job_rdma_hca_objects{device="hfi1_0",hostname="",manager="slurm",project="testacc3",user="testusr2",uuid="1009250"} 289 +ceems_slurm_job_rdma_hca_objects{device="hfi1_1",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 1479 +ceems_slurm_job_rdma_hca_objects{device="hfi1_2",hostname="",manager="slurm",project="testacc2",user="testusr2",uuid="1009249"} 2479 # HELP ceems_slurm_jobs Total number of jobs # TYPE ceems_slurm_jobs gauge ceems_slurm_jobs{hostname="",manager="slurm"} 3
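For reference, the ttar fixtures above show the cgroup rdma controller's file format (e.g. "hfi1_0 hca_handle=479 hca_object=340" in rdma.current). The patch itself does not parse these files by hand; it reads the ready-made stats from the containerd/cgroups library (stats.Rdma.Current). The following is a minimal, illustrative Go sketch of how such a line could be turned into the per-device handle/object maps that the collector exports. The function name parseRDMACurrent and its return shape are assumptions made for this example only and are not part of the patch.

package main

import (
	"bufio"
	"fmt"
	"strconv"
	"strings"
)

// parseRDMACurrent converts rdma.current-style content into
// device -> HCA handle and device -> HCA object counts.
func parseRDMACurrent(content string) (handles, objects map[string]float64) {
	handles = make(map[string]float64)
	objects = make(map[string]float64)
	scanner := bufio.NewScanner(strings.NewReader(content))
	for scanner.Scan() {
		// Each line looks like: "hfi1_0 hca_handle=479 hca_object=340"
		fields := strings.Fields(scanner.Text())
		if len(fields) < 2 {
			continue
		}
		device := fields[0]
		for _, kv := range fields[1:] {
			parts := strings.SplitN(kv, "=", 2)
			if len(parts) != 2 {
				continue
			}
			// rdma.max may contain "max" instead of a number; skip non-numeric values.
			val, err := strconv.ParseFloat(parts[1], 64)
			if err != nil {
				continue
			}
			switch parts[0] {
			case "hca_handle":
				handles[device] = val
			case "hca_object":
				objects[device] = val
			}
		}
	}
	return handles, objects
}

func main() {
	handles, objects := parseRDMACurrent("hfi1_0 hca_handle=479 hca_object=340\nhfi1_1 hca_handle=1479 hca_object=1340\n")
	fmt.Println(handles) // map[hfi1_0:479 hfi1_1:1479]
	fmt.Println(objects) // map[hfi1_0:340 hfi1_1:1340]
}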