Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelogs/changelog.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
---
ancestor: null
releases:
0.1.0:
Expand Down
2 changes: 1 addition & 1 deletion roles/ceems_exporter/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ ceems_exporter_enabled_collectors: []
ceems_exporter_disabled_collectors: []
ceems_exporter_create_unique_jobids: false
ceems_exporter_slurm_job_props_dir: ""
ceems_exporter_gpu_type: nvidia
ceems_exporter_gpu_type: ""
ceems_exporter_gpu_job_map_dir: ""
ceems_exporter_emissions_country_code: FR

Expand Down
9 changes: 3 additions & 6 deletions roles/ceems_exporter/meta/argument_specs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,15 +57,12 @@ argument_specs:
ceems_exporter_gpu_type:
description:
- GPU type.
- Check the L(example scripts,https://github.com/mahendrapaipuri/ceems_monitoring/tree/main/etc/slurm/epilog.d).
default: nvidia
choices:
- nvidia
- amd
- Currently, nVIDIA and AMD GPUs are supported.
default: ""
ceems_exporter_gpu_job_map_dir:
description:
- Directory where files containing mapping of SLURM job ID to GPU ordinals are created by Epilog scripts.
- Currently, nVIDIA and AMD GPUs are supported.
- Check the L(example scripts,https://github.com/mahendrapaipuri/ceems_monitoring/tree/main/etc/slurm/epilog.d).
default: ""
ceems_exporter_emissions_country_code:
description: ISO 3166-1 alpha-3 Country code to export emission data.
Expand Down
8 changes: 5 additions & 3 deletions roles/ceems_exporter/molecule/alternative/molecule.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@ provisioner:
ceems_exporter_basic_auth_users:
randomuser: examplepassword
go_arch: amd64
ceems_exporter_slurm_job_props_dir: /run/slurmjobprops
ceems_exporter_gpu_job_map_dir: /run/gpujobmap
ceems_exporter_ipmi_dcmi_cmd: ipmi-dcmi
ceems_exporter_enabled_collectors:
- emissions
ceems_exporter_disabled_collectors:
- slurm
ceems_exporter_ipmi_dcmi_cmd: "sudo ipmi-dcmi"
ceems_exporter_env_vars:
EMAPS_API_TOKEN: foo
9 changes: 0 additions & 9 deletions roles/ceems_exporter/molecule/alternative/prepare.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,3 @@
dest: "{{ ceems_exporter_tls_server_config.cert_file }}"
- src: /tmp/tls.key
dest: "{{ ceems_exporter_tls_server_config.key_file }}"

- name: Create slurm job props and GPU job map directories
ansible.builtin.file:
path: "{{ item }}"
state: directory
mode: "0755"
loop:
- "{{ ceems_exporter_slurm_job_props_dir }}"
- "{{ ceems_exporter_gpu_job_map_dir }}"
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,14 @@ def test_directories(host, dir):
assert d.exists


@pytest.mark.parametrize("file", [
"/etc/sudoers.d/allow-ipmi-dcmi",
])
def test_files(host, file):
f = host.file(file)
assert f.exists


def test_service(host):
s = host.service("ceems_exporter")
try:
Expand All @@ -35,10 +43,7 @@ def test_systemd_properties(host):
p = s.systemd_properties
assert p.get("ProtectHome") == "yes"
assert p.get("Environment") == "EMAPS_API_TOKEN=foo"
# Seems like this test will fail on centos 7
if host.system_info.distribution != "centos" and host.system_info.release != "7":
assert p.get("AmbientCapabilities") == "cap_setgid cap_setuid"
assert p.get("CapabilityBoundingSet") == "cap_setgid cap_setuid"
assert p.get("AmbientCapabilities") in [None, "", "0", "False", False, "No", "no"]


@pytest.mark.parametrize("socket", [
Expand Down
3 changes: 2 additions & 1 deletion roles/ceems_exporter/tasks/configure.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@
user: "{{ ceems_exporter_system_user }}"
commands: /usr/sbin/ipmi-dcmi
when:
- '"ipmi_dcmi" in ceems_exporter_disabled_collectors'
- 'not "ipmi_dcmi" in ceems_exporter_disabled_collectors'
- '"sudo" in ceems_exporter_ipmi_dcmi_cmd'
- ceems_exporter_system_user != "root"

- name: Allow ceems_exporter port in SELinux on RedHat OS family
Expand Down
10 changes: 6 additions & 4 deletions roles/ceems_exporter/templates/ceems_exporter.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -61,17 +61,19 @@ StartLimitInterval=0
{% endfor %}
ProtectHome={{ ns.protect_home }}

{% if "sudo" not in ceems_exporter_ipmi_dcmi_cmd and
"ipmi_dcmi" not in ceems_exporter_disabled_collectors and
{% if "ipmi_dcmi" not in ceems_exporter_disabled_collectors and
"sudo" not in ceems_exporter_ipmi_dcmi_cmd and
ceems_exporter_system_user != "root" %}
{% set _ = ns.caps.extend(['CAP_SETUID', 'CAP_SETGID']) %}
{% endif %}
{% if ceems_exporter_create_unique_jobids and
"slurm" not in ceems_exporter_disabled_collectors and

{% if "slurm" not in ceems_exporter_disabled_collectors and
ceems_exporter_create_unique_jobids and
not ceems_exporter_slurm_job_props_dir and
ceems_exporter_system_user != "root" %}
{% set _ = ns.caps.extend(['CAP_SYS_PTRACE', 'CAP_DAC_READ_SEARCH']) %}
{% endif %}

{% if ceems_exporter_gpu_type and
not ceems_exporter_gpu_job_map_dir and
ceems_exporter_system_user != "root" %}
Expand Down
2 changes: 1 addition & 1 deletion roles/nvidia_dcgm_exporter/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
---
dcgm_exporter_version: 3.3.3-3.3.1
golang_version: 1.21.5

dcgm_exporter_binary_local_dir: ""

dcgm_exporter_git_url: https://github.com/NVIDIA/dcgm-exporter.git
Expand Down
3 changes: 0 additions & 3 deletions roles/nvidia_dcgm_exporter/meta/argument_specs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,6 @@ argument_specs:
dcgm_exporter_version:
description: DCGM exporter package version. Also accepts latest as parameter.
default: 3.3.0-3.2.0
golang_version:
description: Go version used to compile dcgm_exporter.
default: 1.21.5
dcgm_exporter_skip_install:
description: DCGM exporter installation tasks gets skipped when set to true.
type: bool
Expand Down
4 changes: 2 additions & 2 deletions roles/nvidia_dcgm_exporter/tasks/install.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
run_once: true
ansible.builtin.get_url:
url: "{{ _golang_download_url }}"
dest: /tmp/golang-{{ golang_version }}.linux-{{ go_arch }}.tar.gz
dest: /tmp/golang-{{ _golang_version }}.linux-{{ go_arch }}.tar.gz
mode: "0644"
register: _download_golang
until: _download_golang is succeeded
Expand All @@ -42,7 +42,7 @@
become: false
run_once: true
ansible.builtin.unarchive:
src: /tmp/golang-{{ golang_version }}.linux-{{ go_arch }}.tar.gz
src: /tmp/golang-{{ _golang_version }}.linux-{{ go_arch }}.tar.gz
dest: /tmp
creates: /tmp/go
delegate_to: localhost
Expand Down
4 changes: 3 additions & 1 deletion roles/nvidia_dcgm_exporter/vars/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ go_arch_map:
armv6l: armv6

go_arch: "{{ go_arch_map[ansible_architecture] | default(ansible_architecture) }}"

_nvidia_dcgm_exporter_repo: NVIDIA/dcgm-exporter
_github_api_headers: "{{ {'GITHUB_TOKEN': lookup('ansible.builtin.env', 'GITHUB_TOKEN')} if (lookup('ansible.builtin.env', 'GITHUB_TOKEN')) else {} }}"
_golang_download_url: https://go.dev/dl/go{{ golang_version }}.linux-{{ go_arch }}.tar.gz
_golang_version: 1.21.5
_golang_download_url: https://go.dev/dl/go{{ _golang_version }}.linux-{{ go_arch }}.tar.gz