From 6d7b7dabcc01d05f0dd765eecd7ac87af662b802 Mon Sep 17 00:00:00 2001 From: Aravind Neelakantan Date: Mon, 10 Nov 2025 17:32:30 -0600 Subject: [PATCH 01/10] update openzfs mounting logic --- .../base-config/mount_fsx_openzfs.sh | 68 +++++++++++++++---- 1 file changed, 55 insertions(+), 13 deletions(-) diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx_openzfs.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx_openzfs.sh index 4f068317d..ad866c011 100644 --- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx_openzfs.sh +++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx_openzfs.sh @@ -76,21 +76,63 @@ install_nfs_client() # Mount the FSx OpenZFS file system mount_fs() { - # Create mount point directory if it doesn't exist - if [ ! -d "$OPENZFS_MOUNT_POINT" ]; then - mkdir -p "$OPENZFS_MOUNT_POINT" - fi + local max_attempts=5 + local attempt=1 + local delay=5 + + echo "[INFO] Ensuring $OPENZFS_MOUNT_POINT directory exists..." + ansible localhost -b -m ansible.builtin.file -a "path=$OPENZFS_MOUNT_POINT state=directory" || true + + echo "[INFO] Mounting FSx OpenZFS on $OPENZFS_MOUNT_POINT..." + + while (( attempt <= max_attempts )); do + echo "============================" + echo "[INFO] Attempt $attempt of $max_attempts" + echo "============================" + + echo "[STEP] Mounting FSx OpenZFS..." + if ! ansible localhost -b -m ansible.posix.mount -a \ + "path=$OPENZFS_MOUNT_POINT src=$FSX_OPENZFS_DNS_NAME:/fsx fstype=nfs opts=nfsvers=$NFS_VERSION,_netdev,nconnect=16,x-systemd.automount,x-systemd.requires=network-online.target dump=0 passno=0 state=mounted"; then + echo "[WARN] Mount command failed — retrying in $delay seconds" + sleep "$delay"; ((attempt++)); continue + fi + + echo "[STEP] Verifying mountpoint..." + if ! ansible localhost -b -m ansible.builtin.command -a "mountpoint $OPENZFS_MOUNT_POINT"; then + echo "[WARN] Mountpoint verification failed — retrying in $delay seconds" + sleep "$delay"; ((attempt++)); continue + fi - retry_with_backoff $MAX_ATTEMPTS $INITIAL_BACKOFF "ansible localhost -b -m ansible.posix.mount -a \"path=$OPENZFS_MOUNT_POINT src=$FSX_OPENZFS_DNS_NAME:/fsx fstype=nfs opts=nfsvers=$NFS_VERSION,_netdev,nconnect=16,x-systemd.automount,x-systemd.requires=network-online.target dump=0 passno=0 state=mounted\"" + echo "[STEP] Triggering automount..." + ls -la "$OPENZFS_MOUNT_POINT" >/dev/null 2>&1 || true + + echo "[STEP] Testing file access (touch)..." + if ! ansible localhost -b -m ansible.builtin.file -a "path=$OPENZFS_MOUNT_POINT/test_file state=touch"; then + echo "[WARN] Touch failed — retrying in $delay seconds" + sleep "$delay"; ((attempt++)); continue + fi + + echo "[STEP] Testing file access (delete)..." + if ! ansible localhost -b -m ansible.builtin.file -a "path=$OPENZFS_MOUNT_POINT/test_file state=absent"; then + echo "[WARN] Delete failed — retrying in $delay seconds" + sleep "$delay"; ((attempt++)); continue + fi + + echo "[SUCCESS] FSx OpenZFS mount succeeded on attempt $attempt" + return 0 + done + + echo "[ERROR] FSx OpenZFS mount failed after $max_attempts attempts" + return 1 } -# Verify mount was successful -verify_mount() +# Restart systemd daemon to ensure mount units are properly loaded +restart_daemon() { - if ! mountpoint -q "$OPENZFS_MOUNT_POINT"; then - echo "Failed to verify mount point $OPENZFS_MOUNT_POINT" - exit 1 - fi + ansible localhost -b -m ansible.builtin.systemd -a "daemon_reload=yes" + ansible localhost -b -m ansible.builtin.systemd -a "name=remote-fs.target state=restarted" + echo "Check status of OpenZFS automount..." + systemctl list-units | grep -i automount || true } main() @@ -99,8 +141,8 @@ main() echo "Using openzfs_mount_point: $OPENZFS_MOUNT_POINT" verify_parameters install_nfs_client - mount_fs - verify_mount + mount_fs || exit 1 + restart_daemon echo "FSx OpenZFS mounted successfully to $OPENZFS_MOUNT_POINT" } From 7b679c711cbc4f509620162d8273e1fbd7aea169 Mon Sep 17 00:00:00 2001 From: Aravind Neelakantan Date: Tue, 11 Nov 2025 00:24:03 -0600 Subject: [PATCH 02/10] adding forceful creation of symlink for .ssh --- .../LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh index b2c32af77..3b9a12919 100755 --- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh +++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh @@ -14,10 +14,10 @@ if [ -d "$FSX_OZFS_DIR" ]; then elif [ -e "$FSX_OZFS_DIR/.ssh" ]; then echo "Removing existing $FSX_OZFS_DIR/.ssh and creating symbolic link..." rm -rf "$FSX_OZFS_DIR/.ssh" - ansible localhost -b -m ansible.builtin.file -a "src='$FSX_DIR/.ssh' dest='$FSX_OZFS_DIR/.ssh' state=link" + ansible localhost -b -m ansible.builtin.file -a "src='$FSX_DIR/.ssh' dest='$FSX_OZFS_DIR/.ssh' state=link force=yes" else echo "Linking $FSX_DIR/.ssh to $FSX_OZFS_DIR/.ssh..." - ansible localhost -b -m ansible.builtin.file -a "src='$FSX_DIR/.ssh' dest='$FSX_OZFS_DIR/.ssh' state=link" + ansible localhost -b -m ansible.builtin.file -a "src='$FSX_DIR/.ssh' dest='$FSX_OZFS_DIR/.ssh' state=link force=yes" fi fi From 952886b22825a0f355045a3ac71389b6a782c0ae Mon Sep 17 00:00:00 2001 From: Aravind Neelakantan Date: Tue, 11 Nov 2025 16:05:28 -0600 Subject: [PATCH 03/10] Added chown to the user for .ssh --- .../LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh index 3b9a12919..5586acd1e 100755 --- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh +++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh @@ -50,4 +50,5 @@ ansible localhost -b -m ansible.builtin.file -a "path='$FSX_DIR/.ssh/id_rsa.pub' ansible localhost -b -m ansible.builtin.file -a "path='$FSX_DIR/.ssh/authorized_keys' owner=ubuntu group=ubuntu mode=0600" # Set permissions for the .ssh directory chmod 700 $FSX_DIR/.ssh -# Change ownership to the ubuntu user \ No newline at end of file +# Change ownership to the ubuntu user +chown -R ubuntu:ubuntu $FSX_DIR/.ssh \ No newline at end of file From d7d5aacb4ee91696111f4a9a550f8b19a5af8a95 Mon Sep 17 00:00:00 2001 From: Aravind Neelakantan Date: Tue, 11 Nov 2025 19:21:07 -0600 Subject: [PATCH 04/10] updating information for cluster user config based on openzfs present --- .../automate-smhp-slurm/automate-cluster-creation.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/1.architectures/5.sagemaker-hyperpod/automate-smhp-slurm/automate-cluster-creation.sh b/1.architectures/5.sagemaker-hyperpod/automate-smhp-slurm/automate-cluster-creation.sh index aae73093d..e029b0635 100755 --- a/1.architectures/5.sagemaker-hyperpod/automate-smhp-slurm/automate-cluster-creation.sh +++ b/1.architectures/5.sagemaker-hyperpod/automate-smhp-slurm/automate-cluster-creation.sh @@ -995,7 +995,11 @@ configure_cluster_users() { read -p "Hit ENTER once step is complete: " echo -e "\n3. Change the Linux shell profile." - echo -e " It should have '/bin/bash -c 'export HOME=/fsx/\$(whoami) && cd \${HOME} && exec /bin/bash' in its first and only line" + if [[ "$ENABLE_FSX_OPENZFS" == "true" ]]; then + echo -e " It should have '/bin/bash -c 'export HOME=/home/\$(whoami) && cd \${HOME} && exec /bin/bash' in its first and only line" + else + echo -e " It should have '/bin/bash -c 'export HOME=/fsx/\$(whoami) && cd \${HOME} && exec /bin/bash' in its first and only line" + fi read -p "Hit ENTER once you've added this line in: " echo -e "\n${GREEN}✅ SSM Run As support configured successfully${NC}" From 57b68858e94a8f6d4d652a5263714ec0d823c43f Mon Sep 17 00:00:00 2001 From: Aravind Neelakantan Date: Tue, 11 Nov 2025 19:23:05 -0600 Subject: [PATCH 05/10] updated to include users other than ubuntu --- .../base-config/utils/fsx_ubuntu.sh | 141 +++++++++++++----- .../base-config/utils/gen-keypair-ubuntu.sh | 124 +++++++++------ 2 files changed, 187 insertions(+), 78 deletions(-) diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/fsx_ubuntu.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/fsx_ubuntu.sh index 6b6eeb850..23e3f728d 100644 --- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/fsx_ubuntu.sh +++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/fsx_ubuntu.sh @@ -6,6 +6,7 @@ WAIT=10 FSX_OZFS_EXISTS=$1 FSX_OPENZFS_DNS_NAME="/home" FSX_L_DNS_NAME="/fsx" +SHARED_USER_FILE="shared_users.txt" # Function to check mount check_mount() @@ -36,6 +37,61 @@ wait_for_mount() done } +# Function to setup home directory for a user with OpenZFS +setup_user_home_openzfs() +{ + local username=$1 + + echo "Setting up OpenZFS home directory for user: $username" + + # Create user directory on OpenZFS + ansible localhost -b -m ansible.builtin.file -a "path='$FSX_OPENZFS_DNS_NAME/$username' state=directory owner=$username group=$username mode=0755" + + # Set home directory to /home/username + ansible localhost -b -m ansible.builtin.user -a "name=$username home='$FSX_OPENZFS_DNS_NAME/$username' move_home=yes" + echo "Home directory set to $FSX_OPENZFS_DNS_NAME/$username" + + # Maintain access to /fsx/username + if wait_for_mount "$FSX_L_DNS_NAME"; then + sudo mkdir -p "$FSX_L_DNS_NAME/$username" + sudo chown "$username:$username" "$FSX_L_DNS_NAME/$username" + else + echo "Warning: FSx Lustre mount not available, skipping $FSX_L_DNS_NAME/$username setup" + fi +} + +# Function to setup home directory for a user with FSx Lustre only +setup_user_home_fsx_lustre() +{ + local username=$1 + local fsx_home=$2 + + echo "Setting up FSx Lustre home directory for user: $username at $fsx_home" + + if [ -d "$fsx_home" ]; then + sudo usermod -d "$fsx_home" "$username" + elif [ -d "$FSX_L_DNS_NAME" ]; then + # Create the directory + sudo mkdir -p "$fsx_home" + sudo chown "$username:$username" "$fsx_home" + + # Try to change home directory with move + if ! sudo usermod -m -d "$fsx_home" "$username"; then + echo "Warning: Could not move home directory for $username. Setting home without moving files." + + # If user has existing home, copy contents + if [ -d "/home/$username" ]; then + sudo rsync -a "/home/$username/" "$fsx_home/" + fi + sudo chown -R "$username:$username" "$fsx_home" + + sudo usermod -d "$fsx_home" "$username" + else + echo "Home directory moved successfully to $fsx_home" + fi + fi +} + if [ -z "$FSX_OZFS_EXISTS" ]; then echo "Error: Missing parameter. Usage: $0 <1|0> (1 if OpenZFS exists, 0 otherwise)" exit 1 @@ -43,49 +99,62 @@ fi # Check if OpenZFS is mounted if [ $FSX_OZFS_EXISTS -eq 1 ]; then - echo "OpenZFS is mounted. Looping to ensure FSxOZFS is mounted." - + echo "OpenZFS is mounted. Setting up home directories on OpenZFS." + if wait_for_mount "$FSX_OPENZFS_DNS_NAME"; then - ansible localhost -b -m ansible.builtin.file -a "path='$FSX_OPENZFS_DNS_NAME/ubuntu' state=directory owner=ubuntu group=ubuntu mode=0755" - - echo "OpenZFS is mounted at $FSX_OPENZFS_DNS_NAME" - # Set home directory to /home/ubuntu - ansible localhost -b -m ansible.builtin.user -a "name=ubuntu home='$FSX_OPENZFS_DNS_NAME/ubuntu' move_home=yes" - echo "Home directory set to $FSX_OPENZFS_DNS_NAME/ubuntu" - - # Maintain access to /fsx/ubuntu - if wait_for_mount "$FSX_L_DNS_NAME"; then - sudo mkdir -p "$FSX_L_DNS_NAME/ubuntu" - sudo chown ubuntu:ubuntu "$FSX_L_DNS_NAME/ubuntu" + # Setup ubuntu user first + echo "Setting up home directory for default ubuntu user..." + setup_user_home_openzfs "ubuntu" + + # Process additional users from shared_users.txt if it exists + if [[ -f $SHARED_USER_FILE ]]; then + echo "Found $SHARED_USER_FILE, processing additional users..." + + while IFS="," read -r username uid home; do + # Skip empty lines + if [[ -z "$username" ]]; then + continue + fi + + echo "Processing home directory for user: $username" + setup_user_home_openzfs "$username" + done < "$SHARED_USER_FILE" + + echo "All users from $SHARED_USER_FILE processed successfully" else - echo "Warning: FSx mount not available, skipping $FSX_L_DNS_NAME/ubuntu setup" + echo "No $SHARED_USER_FILE found, only ubuntu user configured" fi fi else - echo "OpenZFS is not mounted. Skipped OZFS check loop, and looping for FSxL only." - echo "Using FSxL file system as home..." - + echo "OpenZFS is not mounted. Using FSx Lustre file system as home..." + if ! wait_for_mount "$FSX_L_DNS_NAME"; then echo "Warning: FSx mount not available. Exiting." exit 1 fi - if [ -d "$FSX_L_DNS_NAME/ubuntu" ]; then - sudo usermod -d "$FSX_L_DNS_NAME/ubuntu" ubuntu - elif [ -d "$FSX_L_DNS_NAME" ]; then - # Create the directory (race condition: if it doesn't get detected) - sudo mkdir -p "$FSX_L_DNS_NAME/ubuntu" - sudo chown ubuntu:ubuntu "$FSX_L_DNS_NAME/ubuntu" - - # Try to change home directory with move (race condition) - if ! sudo usermod -m -d "$FSX_L_DNS_NAME/ubuntu" ubuntu; then - echo "Warning: Could not move home directory. Setting home without moving files." - - sudo rsync -a /home/ubuntu/ "$FSX_L_DNS_NAME/ubuntu/" - sudo chown -R ubuntu:ubuntu "$FSX_L_DNS_NAME/ubuntu" - - sudo usermod -d "$FSX_L_DNS_NAME/ubuntu" ubuntu - else - echo "Home directory moved successfully to $FSX_L_DNS_NAME/ubuntu" - fi + + # Setup ubuntu user first + echo "Setting up home directory for default ubuntu user..." + setup_user_home_fsx_lustre "ubuntu" "$FSX_L_DNS_NAME/ubuntu" + + # Process additional users from shared_users.txt if it exists + if [[ -f $SHARED_USER_FILE ]]; then + echo "Found $SHARED_USER_FILE, processing additional users..." + + while IFS="," read -r username uid home; do + # Skip empty lines + if [[ -z "$username" ]]; then + continue + fi + + echo "Processing home directory for user: $username at $home" + setup_user_home_fsx_lustre "$username" "$home" + done < "$SHARED_USER_FILE" + + echo "All users from $SHARED_USER_FILE processed successfully" + else + echo "No $SHARED_USER_FILE found, only ubuntu user configured" fi -fi \ No newline at end of file +fi + +echo "Home directory setup completed for all users" \ No newline at end of file diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh index 5586acd1e..c8495821d 100755 --- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh +++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh @@ -2,53 +2,93 @@ set -exuo pipefail -FSX_DIR="/fsx/ubuntu" -FSX_OZFS_DIR="/home/ubuntu" +SHARED_USER_FILE="shared_users.txt" -mkdir -p $FSX_DIR/.ssh - -# Creating symlink between /fsx/ubuntu/.ssh and /home/ubuntu/.ssh -if [ -d "$FSX_OZFS_DIR" ]; then - if [ -L "$FSX_OZFS_DIR/.ssh" ]; then - echo "$FSX_OZFS_DIR/.ssh is already a symbolic link" - elif [ -e "$FSX_OZFS_DIR/.ssh" ]; then - echo "Removing existing $FSX_OZFS_DIR/.ssh and creating symbolic link..." - rm -rf "$FSX_OZFS_DIR/.ssh" - ansible localhost -b -m ansible.builtin.file -a "src='$FSX_DIR/.ssh' dest='$FSX_OZFS_DIR/.ssh' state=link force=yes" +# Function to setup SSH keys for a single user +setup_user_ssh() { + local username=$1 + local fsx_home=$2 + + echo "Setting up SSH keys for user: $username" + + local FSX_DIR="$fsx_home" + local FSX_OZFS_DIR="/home/$username" + + # Create .ssh directory on FSx + mkdir -p "$FSX_DIR/.ssh" + + # Creating symlink between /fsx/username/.ssh and /home/username/.ssh (if OpenZFS is mounted) + if [ -d "$FSX_OZFS_DIR" ]; then + if [ -L "$FSX_OZFS_DIR/.ssh" ]; then + echo "$FSX_OZFS_DIR/.ssh is already a symbolic link" + elif [ -e "$FSX_OZFS_DIR/.ssh" ]; then + echo "Removing existing $FSX_OZFS_DIR/.ssh and creating symbolic link..." + rm -rf "$FSX_OZFS_DIR/.ssh" + ansible localhost -b -m ansible.builtin.file -a "src='$FSX_DIR/.ssh' dest='$FSX_OZFS_DIR/.ssh' state=link force=yes" + else + echo "Linking $FSX_DIR/.ssh to $FSX_OZFS_DIR/.ssh..." + ansible localhost -b -m ansible.builtin.file -a "src='$FSX_DIR/.ssh' dest='$FSX_OZFS_DIR/.ssh' state=link force=yes" + fi + fi + + cd "$FSX_DIR/.ssh" + + # Check if id_rsa exists + if [ ! -f id_rsa ]; then + GENERATE_KEYPAIR=1 else - echo "Linking $FSX_DIR/.ssh to $FSX_OZFS_DIR/.ssh..." - ansible localhost -b -m ansible.builtin.file -a "src='$FSX_DIR/.ssh' dest='$FSX_OZFS_DIR/.ssh' state=link force=yes" + GENERATE_KEYPAIR=0 + # Check if id_rsa.pub exists in authorized_keys + if ! grep -qF "$(cat id_rsa.pub)" authorized_keys 2>/dev/null; then + # If not, add the public key to authorized_keys + cat id_rsa.pub >> authorized_keys + fi fi -fi - -cd $FSX_DIR/.ssh - -# Check if id_rsa exists -if [ ! -f id_rsa ]; then - GENERATE_KEYPAIR=1 -else - GENERATE_KEYPAIR=0 - # Check if id_rsa.pub exists in authorized_keys - if ! grep -qF "$(cat id_rsa.pub)" authorized_keys 2>/dev/null; then - # If not, add the public key to authorized_keys + + if [[ $GENERATE_KEYPAIR == 1 ]]; then + echo "Generate a new keypair for $username..." + ssh-keygen -t rsa -b 4096 -q -f id_rsa -N "" 2>/dev/null || true cat id_rsa.pub >> authorized_keys + else + echo "Use existing keypair for $username..." fi -fi -if [[ $GENERATE_KEYPAIR == 1 ]]; then - echo Generate a new keypair... - ssh-keygen -t rsa -b 4096 -q -f id_rsa -N "" 2>/dev/null || true - cat id_rsa.pub >> authorized_keys + + # Set permissions for the ssh keypair + ansible localhost -m ansible.builtin.file -a "path=$FSX_DIR/.ssh/authorized_keys state=touch" + ansible localhost -b -m ansible.builtin.file -a "path='$FSX_DIR/.ssh/id_rsa' owner=$username group=$username mode=0600" + ansible localhost -b -m ansible.builtin.file -a "path='$FSX_DIR/.ssh/id_rsa.pub' owner=$username group=$username mode=0644" + ansible localhost -b -m ansible.builtin.file -a "path='$FSX_DIR/.ssh/authorized_keys' owner=$username group=$username mode=0600" + + # Set permissions for the .ssh directory + chmod 700 "$FSX_DIR/.ssh" + + # Change ownership to the user + chown -R "$username:$username" "$FSX_DIR/.ssh" + + echo "SSH setup completed for user: $username" +} + +# Always setup ubuntu user first +echo "Setting up SSH keys for default ubuntu user..." +setup_user_ssh "ubuntu" "/fsx/ubuntu" + +# Process additional users from shared_users.txt if it exists +if [[ -f $SHARED_USER_FILE ]]; then + echo "Found $SHARED_USER_FILE, processing additional users..." + + while IFS="," read -r username uid home; do + # Skip empty lines + if [[ -z "$username" ]]; then + continue + fi + + echo "Processing user: $username with home: $home" + setup_user_ssh "$username" "$home" + done < "$SHARED_USER_FILE" + + echo "All users from $SHARED_USER_FILE processed successfully" else - echo Use existing keypair... + echo "No $SHARED_USER_FILE found, only ubuntu user configured" fi -# (diff: do this regardless of if new kp is generated to ensure consistent permissions) -# Set permissions for the ssh keypair -ansible localhost -m ansible.builtin.file -a "path=$FSX_DIR/.ssh/authorized_keys state=touch" -ansible localhost -b -m ansible.builtin.file -a "path='$FSX_DIR/.ssh/id_rsa' owner=ubuntu group=ubuntu mode=0600" -ansible localhost -b -m ansible.builtin.file -a "path='$FSX_DIR/.ssh/id_rsa.pub' owner=ubuntu group=ubuntu mode=0644" -ansible localhost -b -m ansible.builtin.file -a "path='$FSX_DIR/.ssh/authorized_keys' owner=ubuntu group=ubuntu mode=0600" -# Set permissions for the .ssh directory -chmod 700 $FSX_DIR/.ssh -# Change ownership to the ubuntu user -chown -R ubuntu:ubuntu $FSX_DIR/.ssh \ No newline at end of file +echo "SSH key generation completed for all users" \ No newline at end of file From b743ec2aef5eb4d7c76f4133555fb0e530509dc7 Mon Sep 17 00:00:00 2001 From: Aravind Neelakantan Date: Wed, 12 Nov 2025 12:59:11 -0600 Subject: [PATCH 06/10] fixed relative path for shared_users.txt --- .../LifecycleScripts/base-config/utils/fsx_ubuntu.sh | 5 ++++- .../LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/fsx_ubuntu.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/fsx_ubuntu.sh index 23e3f728d..766ba1036 100644 --- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/fsx_ubuntu.sh +++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/fsx_ubuntu.sh @@ -6,7 +6,10 @@ WAIT=10 FSX_OZFS_EXISTS=$1 FSX_OPENZFS_DNS_NAME="/home" FSX_L_DNS_NAME="/fsx" -SHARED_USER_FILE="shared_users.txt" + +# Look for shared_users.txt in parent directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SHARED_USER_FILE="${SCRIPT_DIR}/../shared_users.txt" # Function to check mount check_mount() diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh index c8495821d..9322ab2e7 100755 --- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh +++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh @@ -2,7 +2,9 @@ set -exuo pipefail -SHARED_USER_FILE="shared_users.txt" +# Look for shared_users.txt in parent directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SHARED_USER_FILE="${SCRIPT_DIR}/../shared_users.txt" # Function to setup SSH keys for a single user setup_user_ssh() { From cdcb63b172bbfb876ccb8dffcad5fc19a57d9c90 Mon Sep 17 00:00:00 2001 From: Aravind Neelakantan Date: Wed, 12 Nov 2025 14:41:34 -0600 Subject: [PATCH 07/10] Adding xargs to strip carriage returns --- .../base-config/utils/fsx_ubuntu.sh | 38 ++++++++++++++++--- .../base-config/utils/gen-keypair-ubuntu.sh | 21 ++++++++-- 2 files changed, 51 insertions(+), 8 deletions(-) diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/fsx_ubuntu.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/fsx_ubuntu.sh index 766ba1036..afbfd032d 100644 --- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/fsx_ubuntu.sh +++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/fsx_ubuntu.sh @@ -112,14 +112,28 @@ if [ $FSX_OZFS_EXISTS -eq 1 ]; then # Process additional users from shared_users.txt if it exists if [[ -f $SHARED_USER_FILE ]]; then echo "Found $SHARED_USER_FILE, processing additional users..." + echo "Contents of $SHARED_USER_FILE:" + cat "$SHARED_USER_FILE" while IFS="," read -r username uid home; do - # Skip empty lines + # Trim whitespace from all fields + username=$(echo "$username" | xargs) + uid=$(echo "$uid" | xargs) + home=$(echo "$home" | xargs) + + # Skip empty lines or lines that are just whitespace if [[ -z "$username" ]]; then + echo "Skipping empty or invalid line" + continue + fi + + # Verify user exists before trying to set up home + if ! id -u "$username" >/dev/null 2>&1; then + echo "WARNING: User $username does not exist, skipping home setup" continue fi - echo "Processing home directory for user: $username" + echo "Processing home directory for user: '$username'" setup_user_home_openzfs "$username" done < "$SHARED_USER_FILE" @@ -143,14 +157,28 @@ else # Process additional users from shared_users.txt if it exists if [[ -f $SHARED_USER_FILE ]]; then echo "Found $SHARED_USER_FILE, processing additional users..." + echo "Contents of $SHARED_USER_FILE:" + cat "$SHARED_USER_FILE" while IFS="," read -r username uid home; do - # Skip empty lines - if [[ -z "$username" ]]; then + # Trim whitespace from all fields + username=$(echo "$username" | xargs) + uid=$(echo "$uid" | xargs) + home=$(echo "$home" | xargs) + + # Skip empty lines or lines that are just whitespace + if [[ -z "$username" ]] || [[ -z "$home" ]]; then + echo "Skipping empty or invalid line" + continue + fi + + # Verify user exists before trying to set up home + if ! id -u "$username" >/dev/null 2>&1; then + echo "WARNING: User $username does not exist, skipping home setup" continue fi - echo "Processing home directory for user: $username at $home" + echo "Processing home directory for user: '$username' at '$home'" setup_user_home_fsx_lustre "$username" "$home" done < "$SHARED_USER_FILE" diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh index 9322ab2e7..4ea5e10ef 100755 --- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh +++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh @@ -77,14 +77,29 @@ setup_user_ssh "ubuntu" "/fsx/ubuntu" # Process additional users from shared_users.txt if it exists if [[ -f $SHARED_USER_FILE ]]; then echo "Found $SHARED_USER_FILE, processing additional users..." + echo "Contents of $SHARED_USER_FILE:" + cat "$SHARED_USER_FILE" + echo "---" while IFS="," read -r username uid home; do - # Skip empty lines - if [[ -z "$username" ]]; then + # Trim whitespace from all fields + username=$(echo "$username" | xargs) + uid=$(echo "$uid" | xargs) + home=$(echo "$home" | xargs) + + # Skip empty lines or lines that are just whitespace + if [[ -z "$username" ]] || [[ -z "$home" ]]; then + echo "Skipping empty or invalid line" + continue + fi + + # Verify user exists before trying to set up SSH + if ! id -u "$username" >/dev/null 2>&1; then + echo "WARNING: User $username does not exist, skipping SSH setup" continue fi - echo "Processing user: $username with home: $home" + echo "Processing user: '$username' with uid: '$uid' and home: '$home'" setup_user_ssh "$username" "$home" done < "$SHARED_USER_FILE" From e2da5958df08c226576b88bab6cd4bc5d5242aa5 Mon Sep 17 00:00:00 2001 From: Aravind Neelakantan Date: Wed, 12 Nov 2025 15:36:12 -0600 Subject: [PATCH 08/10] fixed the ownership of symlink .ssh dir --- .../base-config/utils/gen-keypair-ubuntu.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh index 4ea5e10ef..9a71cb752 100755 --- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh +++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh @@ -23,13 +23,15 @@ setup_user_ssh() { if [ -d "$FSX_OZFS_DIR" ]; then if [ -L "$FSX_OZFS_DIR/.ssh" ]; then echo "$FSX_OZFS_DIR/.ssh is already a symbolic link" + # Ensure symlink ownership is correct + chown -h "$username:$username" "$FSX_OZFS_DIR/.ssh" elif [ -e "$FSX_OZFS_DIR/.ssh" ]; then echo "Removing existing $FSX_OZFS_DIR/.ssh and creating symbolic link..." rm -rf "$FSX_OZFS_DIR/.ssh" - ansible localhost -b -m ansible.builtin.file -a "src='$FSX_DIR/.ssh' dest='$FSX_OZFS_DIR/.ssh' state=link force=yes" + ansible localhost -b -m ansible.builtin.file -a "src='$FSX_DIR/.ssh' dest='$FSX_OZFS_DIR/.ssh' state=link force=yes owner=$username group=$username" else echo "Linking $FSX_DIR/.ssh to $FSX_OZFS_DIR/.ssh..." - ansible localhost -b -m ansible.builtin.file -a "src='$FSX_DIR/.ssh' dest='$FSX_OZFS_DIR/.ssh' state=link force=yes" + ansible localhost -b -m ansible.builtin.file -a "src='$FSX_DIR/.ssh' dest='$FSX_OZFS_DIR/.ssh' state=link force=yes owner=$username group=$username" fi fi From 78a393665ad6cb201419564bde4422514d753518 Mon Sep 17 00:00:00 2001 From: Aravind Neelakantan Date: Wed, 12 Nov 2025 15:41:26 -0600 Subject: [PATCH 09/10] Updated to include user flag for login --- .../5.sagemaker-hyperpod/easy-ssh.sh | 91 +++++++++++++++---- 1 file changed, 74 insertions(+), 17 deletions(-) diff --git a/1.architectures/5.sagemaker-hyperpod/easy-ssh.sh b/1.architectures/5.sagemaker-hyperpod/easy-ssh.sh index bd007129e..a32c26ff4 100755 --- a/1.architectures/5.sagemaker-hyperpod/easy-ssh.sh +++ b/1.architectures/5.sagemaker-hyperpod/easy-ssh.sh @@ -6,6 +6,7 @@ declare -a HELP=( "[-h|--help]" "[-c|--controller-group]" + "[-u|--user]" "[-r|--region]" "[-p|--profile]" "[-d|--dry-run]" @@ -14,6 +15,7 @@ declare -a HELP=( cluster_name="" node_group="controller-machine" +ssh_user="ubuntu" declare -a aws_cli_args=() DRY_RUN=0 @@ -25,12 +27,33 @@ parse_args() { -h|--help) echo "Access a HyperPod Slurm controller via ssh-over-ssm." echo "Usage: $(basename ${BASH_SOURCE[0]}) ${HELP[@]}" + echo "" + echo "Options:" + echo " -h, --help Show this help message" + echo " -c, --controller-group Specify the controller group name (default: controller-machine)" + echo " -u, --user Specify the SSH user (default: ubuntu)" + echo " -r, --region Specify AWS region" + echo " -p, --profile Specify AWS profile" + echo " -d, --dry-run Show the SSM command without executing" + echo "" + echo "Examples:" + echo " $(basename ${BASH_SOURCE[0]}) ml-cluster" + echo " $(basename ${BASH_SOURCE[0]}) -c login-group ml-cluster" + echo " $(basename ${BASH_SOURCE[0]}) -u user1 ml-cluster" + echo " $(basename ${BASH_SOURCE[0]}) -u user2 -r us-west-2 ml-cluster" + echo "" + echo "Note: For non-ubuntu users, ensure your IAM user has the SSMSessionRunAs tag" + echo " set to the desired OS username for passwordless login." exit 0 ;; -c|--controller-group) node_group="$2" shift 2 ;; + -u|--user) + ssh_user="$2" + shift 2 + ;; -r|--region) aws_cli_args+=(--region "$2") shift 2 @@ -55,12 +78,19 @@ parse_args() { [[ "$cluster_name" == "" ]] && { echo "Must define a cluster name" ; exit -1 ; } } -# Function to check if ml-cluster exists in ~/.ssh/config +# Function to check if cluster config exists in ~/.ssh/config check_ssh_config() { - if grep -wq "Host ${cluster_name}$" ~/.ssh/config; then - echo -e "${BLUE}1. Detected ${GREEN}${cluster_name}${BLUE} in ${GREEN}~/.ssh/config${BLUE}. Skipping adding...${NC}" + local ssh_host="${cluster_name}" + + # If user is not ubuntu, append username to host for unique config entry + if [[ "$ssh_user" != "ubuntu" ]]; then + ssh_host="${cluster_name}-${ssh_user}" + fi + + if grep -wq "Host ${ssh_host}$" ~/.ssh/config; then + echo -e "${BLUE}1. Detected ${GREEN}${ssh_host}${BLUE} in ${GREEN}~/.ssh/config${BLUE}. Skipping adding...${NC}" else - echo -e "${BLUE}Would you like to add ${GREEN}${cluster_name}${BLUE} to ~/.ssh/config (yes/no)?${NC}" + echo -e "${BLUE}Would you like to add ${GREEN}${ssh_host}${BLUE} to ~/.ssh/config (yes/no)?${NC}" read -p "> " ADD_CONFIG if [[ $ADD_CONFIG == "yes" ]]; then @@ -68,16 +98,19 @@ check_ssh_config() { mkdir -p ~/.ssh touch ~/.ssh/config fi - echo -e "${GREEN}✅ adding ml-cluster to ~/.ssh/config:${NC}" + echo -e "${GREEN}✅ adding ${ssh_host} to ~/.ssh/config:${NC}" cat <> ~/.ssh/config -Host ${cluster_name} - User ubuntu +Host ${ssh_host} + User ${ssh_user} ProxyCommand sh -c "aws ssm start-session ${aws_cli_args[@]} --target sagemaker-cluster:${cluster_id}_${node_group}-${instance_id} --document-name AWS-StartSSHSession --parameters 'portNumber=%p'" EOL else - echo -e "${GREEN}❌ skipping adding ml-cluster to ~/.ssh/config:" + echo -e "${GREEN}❌ skipping adding ${ssh_host} to ~/.ssh/config${NC}" fi fi + + # Store the ssh_host for later use + SSH_HOST="${ssh_host}" } escape_spaces() { @@ -88,21 +121,33 @@ escape_spaces() { # Function to add the user's SSH public key to the cluster add_keypair_to_cluster() { PUBLIC_KEY=$(cat ~/.ssh/id_rsa.pub) + + # Determine the authorized_keys path based on user and filesystem + # Check if OpenZFS is mounted (home directory would be /home/username) + local auth_keys_path="/fsx/${ssh_user}/.ssh/authorized_keys" + + # Try to detect if user home is on OpenZFS + local home_check=$(aws ssm start-session --target sagemaker-cluster:${cluster_id}_${node_group}-${instance_id} --document-name AmazonEKS-ExecuteNonInteractiveCommand --parameters command="getent passwd ${ssh_user} | cut -d: -f6" 2>/dev/null || echo "") + + if echo "$home_check" | grep -q "^/home/${ssh_user}"; then + # User home is on OpenZFS, but .ssh is symlinked to /fsx + auth_keys_path="/fsx/${ssh_user}/.ssh/authorized_keys" + fi # Check if the fingerprint already exists in the cluster's authorized_keys - EXISTING_KEYS=$(aws ssm start-session --target sagemaker-cluster:${cluster_id}_${node_group}-${instance_id} --document-name AmazonEKS-ExecuteNonInteractiveCommand --parameters command="cat /fsx/ubuntu/.ssh/authorized_keys") + EXISTING_KEYS=$(aws ssm start-session --target sagemaker-cluster:${cluster_id}_${node_group}-${instance_id} --document-name AmazonEKS-ExecuteNonInteractiveCommand --parameters command="cat ${auth_keys_path}" 2>/dev/null || echo "") if echo "$EXISTING_KEYS" | grep -q "$PUBLIC_KEY"; then - echo -e "${BLUE}2. Detected SSH public key ${GREEN}~/.ssh/id_rsa.pub${BLUE} on the cluster. Skipping adding...${NC}" + echo -e "${BLUE}2. Detected SSH public key ${GREEN}~/.ssh/id_rsa.pub${BLUE} for user ${GREEN}${ssh_user}${BLUE} on the cluster. Skipping adding...${NC}" return else - echo -e "${BLUE}2. Do you want to add your SSH public key ${GREEN}~/.ssh/id_rsa.pub${BLUE} to the cluster (yes/no)?${NC}" + echo -e "${BLUE}2. Do you want to add your SSH public key ${GREEN}~/.ssh/id_rsa.pub${BLUE} to user ${GREEN}${ssh_user}${BLUE} on the cluster (yes/no)?${NC}" read -p "> " ADD_KEYPAIR if [[ $ADD_KEYPAIR == "yes" ]]; then echo "Adding ... ${PUBLIC_KEY}" - command="sed -i \$a$(escape_spaces "$PUBLIC_KEY") /fsx/ubuntu/.ssh/authorized_keys" + command="sed -i \$a$(escape_spaces "$PUBLIC_KEY") ${auth_keys_path}" aws ssm start-session --target sagemaker-cluster:${cluster_id}_${node_group}-${instance_id} --document-name AmazonEKS-ExecuteNonInteractiveCommand --parameters command="$command" - echo "✅ Your SSH public key ~/.ssh/id_rsa.pub has been added to the cluster." + echo "✅ Your SSH public key ~/.ssh/id_rsa.pub has been added to user ${ssh_user} on the cluster." else echo "❌ Skipping adding SSH public key to the cluster." fi @@ -141,18 +186,30 @@ fi echo -e "Cluster id: ${GREEN}${cluster_id}${NC}" echo -e "Instance id: ${GREEN}${instance_id}${NC}" echo -e "Node Group: ${GREEN}${node_group}${NC}" +echo -e "SSH User: ${GREEN}${ssh_user}${NC}" check_ssh_config add_keypair_to_cluster echo -e "\nNow you can run:\n" -echo -e "$ ${GREEN}ssh ${cluster_name}${NC}" +echo -e "$ ${GREEN}ssh ${SSH_HOST}${NC}" [[ DRY_RUN -eq 1 ]] && echo -e "\n${GREEN}aws ssm start-session "${aws_cli_args[@]}" --target sagemaker-cluster:${cluster_id}_${node_group}-${instance_id}${NC}\n" && exit 0 -# Start session as Ubuntu only if the SSM-SessionManagerRunShellAsUbuntu document exists. -if aws ssm describe-document "${aws_cli_args[@]}" --name SSM-SessionManagerRunShellAsUbuntu > /dev/null 2>&1; then - aws ssm start-session "${aws_cli_args[@]}" --target sagemaker-cluster:${cluster_id}_${node_group}-${instance_id} --document SSM-SessionManagerRunShellAsUbuntu +# Determine which SSM document to use based on the user +if [[ "$ssh_user" == "ubuntu" ]]; then + # Start session as Ubuntu if the SSM-SessionManagerRunShellAsUbuntu document exists + if aws ssm describe-document "${aws_cli_args[@]}" --name SSM-SessionManagerRunShellAsUbuntu > /dev/null 2>&1; then + aws ssm start-session "${aws_cli_args[@]}" --target sagemaker-cluster:${cluster_id}_${node_group}-${instance_id} --document SSM-SessionManagerRunShellAsUbuntu + else + aws ssm start-session "${aws_cli_args[@]}" --target sagemaker-cluster:${cluster_id}_${node_group}-${instance_id} + fi else + # For non-ubuntu users, check if they have an IAM user with SSMSessionRunAs tag + echo -e "${BLUE}Connecting as user: ${GREEN}${ssh_user}${NC}" + echo -e "${YELLOW}Note: Make sure the IAM user has the SSMSessionRunAs tag set to '${ssh_user}' for passwordless login${NC}" + echo -e "${YELLOW}See: https://docs.aws.amazon.com/systems-manager/latest/userguide/session-manager-getting-started-enable-ssh-connections.html${NC}" + + # Use standard SSM session - the SSMSessionRunAs tag on the IAM user will determine which OS user to use aws ssm start-session "${aws_cli_args[@]}" --target sagemaker-cluster:${cluster_id}_${node_group}-${instance_id} fi From 36e0191381e9e9d4f5f352729cb2cd064acc14fd Mon Sep 17 00:00:00 2001 From: Aravind Neelakantan Date: Thu, 13 Nov 2025 18:25:16 -0600 Subject: [PATCH 10/10] Fixing race condition during file access testing for fsx lustre and openzfs --- .../LifecycleScripts/base-config/mount_fsx.sh | 6 ++++-- .../LifecycleScripts/base-config/mount_fsx_openzfs.sh | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx.sh index d2f63bb86..fba3b7f5d 100644 --- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx.sh +++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx.sh @@ -166,11 +166,13 @@ mount_fs() { local max_attempts=5 local attempt=1 local delay=5 + local test_file="$MOUNT_POINT/test_file_$(hostname)" echo "[INFO] Ensuring $MOUNT_POINT directory exists..." ansible localhost -b -m ansible.builtin.file -a "path=$MOUNT_POINT state=directory" || true echo "[INFO] Mounting FSx Lustre on $MOUNT_POINT..." + echo "[INFO] Using test file: $test_file" while (( attempt <= max_attempts )); do echo "============================" @@ -193,13 +195,13 @@ mount_fs() { ls -la "$MOUNT_POINT" >/dev/null 2>&1 || true echo "[STEP] Testing file access (touch)..." - if ! ansible localhost -b -m ansible.builtin.file -a "path=$MOUNT_POINT/test_file state=touch"; then + if ! ansible localhost -b -m ansible.builtin.file -a "path=$test_file state=touch"; then echo "[WARN] Touch failed — retrying in $delay seconds" sleep "$delay"; ((attempt++)); continue fi echo "[STEP] Testing file access (delete)..." - if ! ansible localhost -b -m ansible.builtin.file -a "path=$MOUNT_POINT/test_file state=absent"; then + if ! ansible localhost -b -m ansible.builtin.file -a "path=$test_file state=absent"; then echo "[WARN] Delete failed — retrying in $delay seconds" sleep "$delay"; ((attempt++)); continue fi diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx_openzfs.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx_openzfs.sh index ad866c011..d57320fa8 100644 --- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx_openzfs.sh +++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx_openzfs.sh @@ -79,11 +79,13 @@ mount_fs() local max_attempts=5 local attempt=1 local delay=5 + local test_file="$OPENZFS_MOUNT_POINT/test_file_$(hostname)" echo "[INFO] Ensuring $OPENZFS_MOUNT_POINT directory exists..." ansible localhost -b -m ansible.builtin.file -a "path=$OPENZFS_MOUNT_POINT state=directory" || true echo "[INFO] Mounting FSx OpenZFS on $OPENZFS_MOUNT_POINT..." + echo "[INFO] Using test file: $test_file" while (( attempt <= max_attempts )); do echo "============================" @@ -107,13 +109,13 @@ mount_fs() ls -la "$OPENZFS_MOUNT_POINT" >/dev/null 2>&1 || true echo "[STEP] Testing file access (touch)..." - if ! ansible localhost -b -m ansible.builtin.file -a "path=$OPENZFS_MOUNT_POINT/test_file state=touch"; then + if ! ansible localhost -b -m ansible.builtin.file -a "path=$test_file state=touch"; then echo "[WARN] Touch failed — retrying in $delay seconds" sleep "$delay"; ((attempt++)); continue fi echo "[STEP] Testing file access (delete)..." - if ! ansible localhost -b -m ansible.builtin.file -a "path=$OPENZFS_MOUNT_POINT/test_file state=absent"; then + if ! ansible localhost -b -m ansible.builtin.file -a "path=$test_file state=absent"; then echo "[WARN] Delete failed — retrying in $delay seconds" sleep "$delay"; ((attempt++)); continue fi