Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env bash

# This prolog script configures the NVIDIA IMEX nodes config file and reloads the nvidia-imex service.
# This prolog is meant to be run by compute nodes.
# This prolog is meant to be run by compute nodes with exclusive jobs.

# Log file path for this prolog (the code that writes to it is not visible in this view).
LOG_FILE_PATH="/var/log/parallelcluster/nvidia-imex-prolog.log"
# Absolute path to the Slurm scontrol binary.
SCONTROL_CMD="/opt/slurm/bin/scontrol"
Expand All @@ -11,8 +11,6 @@ IMEX_STOP_TIMEOUT=15
# Regex anchored at the start of the string (^) listing instance-type prefixes;
# presumably used to restrict the prolog to these instance types — confirm at the use site.
ALLOWED_INSTANCE_TYPES="^(p6e-gb200|g5g)"
# systemd unit name passed to systemctl when checking/starting the IMEX service.
IMEX_SERVICE="nvidia-imex"



function info() {
  # Write one INFO record to stdout in the form
  #   <timestamp with milliseconds> [INFO] [PID:<shell pid>] [JOB:<SLURM_JOB_ID>] <message>
  # $1 is the message text; SLURM_JOB_ID is read from the environment.
  printf '%s [INFO] [PID:%s] [JOB:%s] %s\n' \
    "$(date "+%Y-%m-%dT%H:%M:%S.%3N")" "$$" "${SLURM_JOB_ID}" "$1"
}
Expand Down Expand Up @@ -82,47 +80,6 @@ function get_compute_resource_name() {
echo "${_slurmd_node_name}" | sed -E "s/${_queue_name_prefix}(.+)-[0-9]+$/\1/"
}

#######################################
# Decide whether the IMEX service has to be (re)started.
# A reload is needed when the service is not active, when its status cannot be
# obtained or parsed, or when the hosts it reports differ from the expected IPs.
# Globals:
#   IMEX_SERVICE (read) - systemd unit name of the IMEX service
# Arguments:
#   $1 - expected IPs as a comma-separated list
#   $2 - IMEX config file handed to nvidia-imex-ctl via -c
# Outputs:
#   Progress messages through info()
# Returns:
#   0 if a reload is needed, 1 if the reload can be skipped
#######################################
function check_imex_needs_reload() {
  local _expected_ips=$1
  local _imex_config_file=$2

  # A service that is not active always needs to be started.
  if ! systemctl is-active "${IMEX_SERVICE}" &>/dev/null; then
    info "IMEX service is not running, reload needed"
    return 0 # Need reload
  fi

  # Ask the running service for its status; bounded so a hung daemon cannot
  # stall the prolog.
  local imex_status_output
  if ! imex_status_output=$(timeout 15 /usr/bin/nvidia-imex-ctl -N -j -c "${_imex_config_file}" 2>/dev/null); then
    info "Failed to get IMEX status, assuming reload needed"
    return 0 # Need reload
  fi

  # Extract the hosts reported by IMEX as a sorted, space-separated string.
  # pipefail is enabled only inside this command substitution so that a jq
  # failure is not hidden by the exit status of the trailing sort/tr stages.
  local current_imex_ips
  if ! current_imex_ips=$(
    set -o pipefail
    printf '%s\n' "${imex_status_output}" \
      | jq -r '.nodes | to_entries[].value.host' 2>/dev/null \
      | sort \
      | tr '\n' ' '
  ); then
    info "Failed to parse IMEX status JSON, assuming reload needed"
    return 0 # Need reload
  fi

  # Normalise the expected IPs the same way (one per line, sorted, joined with
  # spaces) so both sides can be compared as plain strings.
  local expected_ips_sorted
  expected_ips_sorted=$(printf '%s\n' "${_expected_ips}" | tr ',' '\n' | sort | tr '\n' ' ')

  info "Current IMEX IPs: ${current_imex_ips}"
  info "Expected IPs: ${expected_ips_sorted}"

  if [[ "${current_imex_ips}" == "${expected_ips_sorted}" ]]; then
    info "IMEX service running with correct IPs, skipping reload"
    return 1 # Skip reload
  fi

  info "IMEX IPs mismatch, reload needed"
  return 0 # Need reload
}

function write_file() {
local _file=$1
local _content=$2
Expand Down Expand Up @@ -181,32 +138,7 @@ function reload_imex() {
# sed -i "s/SERVER_PORT.*/SERVER_PORT=${NEW_SERVER_PORT}/" "${IMEX_MAIN_CONFIG}"

info "Restarting IMEX"
if ! timeout ${IMEX_START_TIMEOUT} systemctl start ${IMEX_SERVICE}; then
error "IMEX service reload failed"
return 1
fi

return 0
}

# Logs the given reason, asks check_imex_needs_reload whether a reload is
# needed, and calls reload_imex when it is.
# Arguments:
#   $1 - IPs, passed through to check_imex_needs_reload
#   $2 - IMEX main config path, passed through to check_imex_needs_reload
#   $3 - message logged before the check
#   $4 - message logged when the reload is skipped
#   $5 - message logged right before reloading
# A reload_imex failure is logged through error() but not returned; the
# function's exit status is that of its last command.
function handle_imex_reload() {
local _ips_from_cr=$1
local _imex_main_config=$2
local _reload_reason=$3
local _skip_message=$4
local _reload_message=$5

info "${_reload_reason}"
# check_imex_needs_reload returns 0 when a reload is required.
if check_imex_needs_reload "${_ips_from_cr}" "${_imex_main_config}"; then
info "${_reload_message}"
if reload_imex; then
info "IMEX has been reloaded"
else
error "Failed to reload IMEX service"
fi
else
info "${_skip_message}"
fi
# NOTE(review): this start runs on every path, including the "skip" branch
# above, and its exit status becomes the function's return value. It may be a
# line from a neighbouring revision of reload_imex — confirm it belongs here.
timeout ${IMEX_START_TIMEOUT} systemctl start ${IMEX_SERVICE}
}

function create_default_imex_channel() {
Expand Down Expand Up @@ -249,12 +181,10 @@ function create_default_imex_channel() {
# Log which IMEX configuration files this run operates on.
info "IMEX Main Config: ${IMEX_MAIN_CONFIG}"
info "IMEX Nodes Config: ${IMEX_NODES_CONFIG}"

# Write the IPs to the nodes config. Both branches call handle_imex_reload
# with the same IPs and config; write_file's exit status only selects which
# set of log messages is used (presumably 0 means the file was changed — confirm
# against write_file).
info "Checking IMEX nodes config ${IMEX_NODES_CONFIG}"
if write_file "${IMEX_NODES_CONFIG}" "${IPS_FROM_CR}"; then
handle_imex_reload "${IPS_FROM_CR}" "${IMEX_MAIN_CONFIG}" "IMEX nodes config updated, checking if reload is needed" "IMEX already configured correctly, skipping reload" "IMEX reload needed, restarting service"
else
handle_imex_reload "${IPS_FROM_CR}" "${IMEX_MAIN_CONFIG}" "IMEX nodes config unchanged, checking if reload is still needed" "IMEX config unchanged and service correctly configured, skipping reload" "IMEX reload needed despite unchanged config, restarting service"
fi
# NOTE(review): write_file is called again with the same arguments and is
# followed by an unconditional reload_imex. This looks like two revisions of
# the same step side by side — confirm which one is intended.
info "Updating IMEX nodes config ${IMEX_NODES_CONFIG}"
write_file "${IMEX_NODES_CONFIG}" "${IPS_FROM_CR}"

reload_imex

prolog_end

Expand Down
Loading