diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx.sh
index d2f63bb86..fba3b7f5d 100644
--- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx.sh
+++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx.sh
@@ -166,11 +166,13 @@ mount_fs() {
   local max_attempts=5
   local attempt=1
   local delay=5
+  local test_file="$MOUNT_POINT/test_file_$(hostname)"
 
   echo "[INFO] Ensuring $MOUNT_POINT directory exists..."
   ansible localhost -b -m ansible.builtin.file -a "path=$MOUNT_POINT state=directory" || true
 
   echo "[INFO] Mounting FSx Lustre on $MOUNT_POINT..."
+  echo "[INFO] Using test file: $test_file"
 
   while (( attempt <= max_attempts )); do
     echo "============================"
@@ -193,13 +195,13 @@ mount_fs() {
     ls -la "$MOUNT_POINT" >/dev/null 2>&1 || true
 
     echo "[STEP] Testing file access (touch)..."
-    if ! ansible localhost -b -m ansible.builtin.file -a "path=$MOUNT_POINT/test_file state=touch"; then
+    if ! ansible localhost -b -m ansible.builtin.file -a "path=$test_file state=touch"; then
       echo "[WARN] Touch failed — retrying in $delay seconds"
       sleep "$delay"; ((attempt++)); continue
     fi
 
     echo "[STEP] Testing file access (delete)..."
-    if ! ansible localhost -b -m ansible.builtin.file -a "path=$MOUNT_POINT/test_file state=absent"; then
+    if ! ansible localhost -b -m ansible.builtin.file -a "path=$test_file state=absent"; then
       echo "[WARN] Delete failed — retrying in $delay seconds"
       sleep "$delay"; ((attempt++)); continue
     fi
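Note on the change above: the touch/delete probe now uses a per-host file name (`test_file_$(hostname)`), so lifecycle scripts running concurrently on many nodes no longer race on a single shared `test_file`. A minimal standalone sketch of the same probe follows; `probe_mount` and the `/fsx` default are illustrative names, not part of the patch, which drives the same steps through `ansible.builtin.file` tasks:

```bash
#!/bin/bash
# Sketch of the per-host mount probe used in mount_fsx.sh above (illustrative only).
probe_mount() {
    local mount_point=$1
    local test_file="$mount_point/test_file_$(hostname)"
    touch "$test_file" || return 1   # fails if the filesystem is not writable yet
    rm -f "$test_file" || return 1   # fails if deletes hang or error out
}

probe_mount "${MOUNT_POINT:-/fsx}" && echo "[OK] mount is read-write"
```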
diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx_openzfs.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx_openzfs.sh
index 4f068317d..d57320fa8 100644
--- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx_openzfs.sh
+++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx_openzfs.sh
@@ -76,21 +76,65 @@ install_nfs_client()
 
 # Mount the FSx OpenZFS file system
 mount_fs()
 {
-  # Create mount point directory if it doesn't exist
-  if [ ! -d "$OPENZFS_MOUNT_POINT" ]; then
-    mkdir -p "$OPENZFS_MOUNT_POINT"
-  fi
+  local max_attempts=5
+  local attempt=1
+  local delay=5
+  local test_file="$OPENZFS_MOUNT_POINT/test_file_$(hostname)"
+
+  echo "[INFO] Ensuring $OPENZFS_MOUNT_POINT directory exists..."
+  ansible localhost -b -m ansible.builtin.file -a "path=$OPENZFS_MOUNT_POINT state=directory" || true
+
+  echo "[INFO] Mounting FSx OpenZFS on $OPENZFS_MOUNT_POINT..."
+  echo "[INFO] Using test file: $test_file"
+
+  while (( attempt <= max_attempts )); do
+    echo "============================"
+    echo "[INFO] Attempt $attempt of $max_attempts"
+    echo "============================"
+
+    echo "[STEP] Mounting FSx OpenZFS..."
+    if ! ansible localhost -b -m ansible.posix.mount -a \
+      "path=$OPENZFS_MOUNT_POINT src=$FSX_OPENZFS_DNS_NAME:/fsx fstype=nfs opts=nfsvers=$NFS_VERSION,_netdev,nconnect=16,x-systemd.automount,x-systemd.requires=network-online.target dump=0 passno=0 state=mounted"; then
+      echo "[WARN] Mount command failed — retrying in $delay seconds"
+      sleep "$delay"; ((attempt++)); continue
+    fi
+
+    echo "[STEP] Verifying mountpoint..."
+    if ! ansible localhost -b -m ansible.builtin.command -a "mountpoint $OPENZFS_MOUNT_POINT"; then
+      echo "[WARN] Mountpoint verification failed — retrying in $delay seconds"
+      sleep "$delay"; ((attempt++)); continue
+    fi
 
-  retry_with_backoff $MAX_ATTEMPTS $INITIAL_BACKOFF "ansible localhost -b -m ansible.posix.mount -a \"path=$OPENZFS_MOUNT_POINT src=$FSX_OPENZFS_DNS_NAME:/fsx fstype=nfs opts=nfsvers=$NFS_VERSION,_netdev,nconnect=16,x-systemd.automount,x-systemd.requires=network-online.target dump=0 passno=0 state=mounted\""
+    echo "[STEP] Triggering automount..."
+    ls -la "$OPENZFS_MOUNT_POINT" >/dev/null 2>&1 || true
+
+    echo "[STEP] Testing file access (touch)..."
+    if ! ansible localhost -b -m ansible.builtin.file -a "path=$test_file state=touch"; then
+      echo "[WARN] Touch failed — retrying in $delay seconds"
+      sleep "$delay"; ((attempt++)); continue
+    fi
+
+    echo "[STEP] Testing file access (delete)..."
+    if ! ansible localhost -b -m ansible.builtin.file -a "path=$test_file state=absent"; then
+      echo "[WARN] Delete failed — retrying in $delay seconds"
+      sleep "$delay"; ((attempt++)); continue
+    fi
+
+    echo "[SUCCESS] FSx OpenZFS mount succeeded on attempt $attempt"
+    return 0
+  done
+
+  echo "[ERROR] FSx OpenZFS mount failed after $max_attempts attempts"
+  return 1
 }
 
-# Verify mount was successful
-verify_mount()
+# Restart systemd daemon to ensure mount units are properly loaded
+restart_daemon()
 {
-  if ! mountpoint -q "$OPENZFS_MOUNT_POINT"; then
-    echo "Failed to verify mount point $OPENZFS_MOUNT_POINT"
-    exit 1
-  fi
+  ansible localhost -b -m ansible.builtin.systemd -a "daemon_reload=yes"
+  ansible localhost -b -m ansible.builtin.systemd -a "name=remote-fs.target state=restarted"
+  echo "Check status of OpenZFS automount..."
+  systemctl list-units | grep -i automount || true
 }
 
@@ -99,8 +143,8 @@ main()
 {
   echo "Using openzfs_mount_point: $OPENZFS_MOUNT_POINT"
   verify_parameters
   install_nfs_client
-  mount_fs
-  verify_mount
+  mount_fs || exit 1
+  restart_daemon
   echo "FSx OpenZFS mounted successfully to $OPENZFS_MOUNT_POINT"
 }
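When the mount loop above keeps failing, it can help to reproduce the `ansible.posix.mount` task by hand. A rough manual equivalent is sketched below; the DNS name and NFS version are placeholders, not values from this patch, and the `x-systemd.*` options are omitted because they only take effect through `/etc/fstab`, not a direct `mount(8)` call:

```bash
# Hypothetical manual repro of the mount task; DNS name and NFS version are placeholders.
FSX_OPENZFS_DNS_NAME="fs-0123456789abcdef0.fsx.us-west-2.amazonaws.com"
NFS_VERSION=4.2
sudo mkdir -p /home
sudo mount -t nfs -o "nfsvers=${NFS_VERSION},nconnect=16,_netdev" \
    "${FSX_OPENZFS_DNS_NAME}:/fsx" /home
mountpoint /home && echo "[OK] FSx OpenZFS mounted"
```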
"$username:$username" "$FSX_L_DNS_NAME/$username" + else + echo "Warning: FSx Lustre mount not available, skipping $FSX_L_DNS_NAME/$username setup" + fi +} + +# Function to setup home directory for a user with FSx Lustre only +setup_user_home_fsx_lustre() +{ + local username=$1 + local fsx_home=$2 + + echo "Setting up FSx Lustre home directory for user: $username at $fsx_home" + + if [ -d "$fsx_home" ]; then + sudo usermod -d "$fsx_home" "$username" + elif [ -d "$FSX_L_DNS_NAME" ]; then + # Create the directory + sudo mkdir -p "$fsx_home" + sudo chown "$username:$username" "$fsx_home" + + # Try to change home directory with move + if ! sudo usermod -m -d "$fsx_home" "$username"; then + echo "Warning: Could not move home directory for $username. Setting home without moving files." + + # If user has existing home, copy contents + if [ -d "/home/$username" ]; then + sudo rsync -a "/home/$username/" "$fsx_home/" + fi + sudo chown -R "$username:$username" "$fsx_home" + + sudo usermod -d "$fsx_home" "$username" + else + echo "Home directory moved successfully to $fsx_home" + fi + fi +} + if [ -z "$FSX_OZFS_EXISTS" ]; then echo "Error: Missing parameter. Usage: $0 <1|0> (1 if OpenZFS exists, 0 otherwise)" exit 1 @@ -43,49 +102,90 @@ fi # Check if OpenZFS is mounted if [ $FSX_OZFS_EXISTS -eq 1 ]; then - echo "OpenZFS is mounted. Looping to ensure FSxOZFS is mounted." - + echo "OpenZFS is mounted. Setting up home directories on OpenZFS." + if wait_for_mount "$FSX_OPENZFS_DNS_NAME"; then - ansible localhost -b -m ansible.builtin.file -a "path='$FSX_OPENZFS_DNS_NAME/ubuntu' state=directory owner=ubuntu group=ubuntu mode=0755" - - echo "OpenZFS is mounted at $FSX_OPENZFS_DNS_NAME" - # Set home directory to /home/ubuntu - ansible localhost -b -m ansible.builtin.user -a "name=ubuntu home='$FSX_OPENZFS_DNS_NAME/ubuntu' move_home=yes" - echo "Home directory set to $FSX_OPENZFS_DNS_NAME/ubuntu" - - # Maintain access to /fsx/ubuntu - if wait_for_mount "$FSX_L_DNS_NAME"; then - sudo mkdir -p "$FSX_L_DNS_NAME/ubuntu" - sudo chown ubuntu:ubuntu "$FSX_L_DNS_NAME/ubuntu" + # Setup ubuntu user first + echo "Setting up home directory for default ubuntu user..." + setup_user_home_openzfs "ubuntu" + + # Process additional users from shared_users.txt if it exists + if [[ -f $SHARED_USER_FILE ]]; then + echo "Found $SHARED_USER_FILE, processing additional users..." + echo "Contents of $SHARED_USER_FILE:" + cat "$SHARED_USER_FILE" + + while IFS="," read -r username uid home; do + # Trim whitespace from all fields + username=$(echo "$username" | xargs) + uid=$(echo "$uid" | xargs) + home=$(echo "$home" | xargs) + + # Skip empty lines or lines that are just whitespace + if [[ -z "$username" ]]; then + echo "Skipping empty or invalid line" + continue + fi + + # Verify user exists before trying to set up home + if ! id -u "$username" >/dev/null 2>&1; then + echo "WARNING: User $username does not exist, skipping home setup" + continue + fi + + echo "Processing home directory for user: '$username'" + setup_user_home_openzfs "$username" + done < "$SHARED_USER_FILE" + + echo "All users from $SHARED_USER_FILE processed successfully" else - echo "Warning: FSx mount not available, skipping $FSX_L_DNS_NAME/ubuntu setup" + echo "No $SHARED_USER_FILE found, only ubuntu user configured" fi fi else - echo "OpenZFS is not mounted. Skipped OZFS check loop, and looping for FSxL only." - echo "Using FSxL file system as home..." - + echo "OpenZFS is not mounted. Using FSx Lustre file system as home..." + if ! 
-    echo "OpenZFS is not mounted. Skipped OZFS check loop, and looping for FSxL only."
-    echo "Using FSxL file system as home..."
-
+    echo "OpenZFS is not mounted. Using FSx Lustre file system as home..."
+
     if ! wait_for_mount "$FSX_L_DNS_NAME"; then
         echo "Warning: FSx mount not available. Exiting."
         exit 1
     fi
-    if [ -d "$FSX_L_DNS_NAME/ubuntu" ]; then
-        sudo usermod -d "$FSX_L_DNS_NAME/ubuntu" ubuntu
-    elif [ -d "$FSX_L_DNS_NAME" ]; then
-        # Create the directory (race condition: if it doesn't get detected)
-        sudo mkdir -p "$FSX_L_DNS_NAME/ubuntu"
-        sudo chown ubuntu:ubuntu "$FSX_L_DNS_NAME/ubuntu"
-
-        # Try to change home directory with move (race condition)
-        if ! sudo usermod -m -d "$FSX_L_DNS_NAME/ubuntu" ubuntu; then
-            echo "Warning: Could not move home directory. Setting home without moving files."
-
-            sudo rsync -a /home/ubuntu/ "$FSX_L_DNS_NAME/ubuntu/"
-            sudo chown -R ubuntu:ubuntu "$FSX_L_DNS_NAME/ubuntu"
-
-            sudo usermod -d "$FSX_L_DNS_NAME/ubuntu" ubuntu
-        else
-            echo "Home directory moved successfully to $FSX_L_DNS_NAME/ubuntu"
-        fi
+
+    # Setup ubuntu user first
+    echo "Setting up home directory for default ubuntu user..."
+    setup_user_home_fsx_lustre "ubuntu" "$FSX_L_DNS_NAME/ubuntu"
+
+    # Process additional users from shared_users.txt if it exists
+    if [[ -f $SHARED_USER_FILE ]]; then
+        echo "Found $SHARED_USER_FILE, processing additional users..."
+        echo "Contents of $SHARED_USER_FILE:"
+        cat "$SHARED_USER_FILE"
+
+        while IFS="," read -r username uid home; do
+            # Trim whitespace from all fields
+            username=$(echo "$username" | xargs)
+            uid=$(echo "$uid" | xargs)
+            home=$(echo "$home" | xargs)
+
+            # Skip empty lines or lines that are just whitespace
+            if [[ -z "$username" ]] || [[ -z "$home" ]]; then
+                echo "Skipping empty or invalid line"
+                continue
+            fi
+
+            # Verify user exists before trying to set up home
+            if ! id -u "$username" >/dev/null 2>&1; then
+                echo "WARNING: User $username does not exist, skipping home setup"
+                continue
+            fi
+
+            echo "Processing home directory for user: '$username' at '$home'"
+            setup_user_home_fsx_lustre "$username" "$home"
+        done < "$SHARED_USER_FILE"
+
+        echo "All users from $SHARED_USER_FILE processed successfully"
+    else
+        echo "No $SHARED_USER_FILE found, only ubuntu user configured"
     fi
-fi
\ No newline at end of file
+fi
+
+echo "Home directory setup completed for all users"
\ No newline at end of file
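Both branches above parse `shared_users.txt` with `IFS="," read -r username uid home`, one user per line. The file itself is not part of this diff; a hypothetical example of the expected format:

```
user1,2001,/fsx/user1
user2,2002,/fsx/user2
```

Users listed here must already exist on the node (the `id -u` check skips any that do not). The third field is consumed by the FSx Lustre branch and by the key-generation script below; the OpenZFS branch ignores it because homes always land under `/home/<username>`.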
diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh
index b2c32af77..9a71cb752 100755
--- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh
+++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/gen-keypair-ubuntu.sh
@@ -2,52 +2,112 @@
 
 set -exuo pipefail
 
-FSX_DIR="/fsx/ubuntu"
-FSX_OZFS_DIR="/home/ubuntu"
+# Look for shared_users.txt in parent directory
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SHARED_USER_FILE="${SCRIPT_DIR}/../shared_users.txt"
 
-mkdir -p $FSX_DIR/.ssh
-
-# Creating symlink between /fsx/ubuntu/.ssh and /home/ubuntu/.ssh
-if [ -d "$FSX_OZFS_DIR" ]; then
-    if [ -L "$FSX_OZFS_DIR/.ssh" ]; then
-        echo "$FSX_OZFS_DIR/.ssh is already a symbolic link"
-    elif [ -e "$FSX_OZFS_DIR/.ssh" ]; then
-        echo "Removing existing $FSX_OZFS_DIR/.ssh and creating symbolic link..."
-        rm -rf "$FSX_OZFS_DIR/.ssh"
-        ansible localhost -b -m ansible.builtin.file -a "src='$FSX_DIR/.ssh' dest='$FSX_OZFS_DIR/.ssh' state=link"
+# Function to setup SSH keys for a single user
+setup_user_ssh() {
+    local username=$1
+    local fsx_home=$2
+
+    echo "Setting up SSH keys for user: $username"
+
+    local FSX_DIR="$fsx_home"
+    local FSX_OZFS_DIR="/home/$username"
+
+    # Create .ssh directory on FSx
+    mkdir -p "$FSX_DIR/.ssh"
+
+    # Creating symlink between /fsx/username/.ssh and /home/username/.ssh (if OpenZFS is mounted)
+    if [ -d "$FSX_OZFS_DIR" ]; then
+        if [ -L "$FSX_OZFS_DIR/.ssh" ]; then
+            echo "$FSX_OZFS_DIR/.ssh is already a symbolic link"
+            # Ensure symlink ownership is correct
+            chown -h "$username:$username" "$FSX_OZFS_DIR/.ssh"
+        elif [ -e "$FSX_OZFS_DIR/.ssh" ]; then
+            echo "Removing existing $FSX_OZFS_DIR/.ssh and creating symbolic link..."
+            rm -rf "$FSX_OZFS_DIR/.ssh"
+            ansible localhost -b -m ansible.builtin.file -a "src='$FSX_DIR/.ssh' dest='$FSX_OZFS_DIR/.ssh' state=link force=yes owner=$username group=$username"
+        else
+            echo "Linking $FSX_DIR/.ssh to $FSX_OZFS_DIR/.ssh..."
+            ansible localhost -b -m ansible.builtin.file -a "src='$FSX_DIR/.ssh' dest='$FSX_OZFS_DIR/.ssh' state=link force=yes owner=$username group=$username"
+        fi
+    fi
+
+    cd "$FSX_DIR/.ssh"
+
+    # Check if id_rsa exists
+    if [ ! -f id_rsa ]; then
+        GENERATE_KEYPAIR=1
     else
-        echo "Linking $FSX_DIR/.ssh to $FSX_OZFS_DIR/.ssh..."
-        ansible localhost -b -m ansible.builtin.file -a "src='$FSX_DIR/.ssh' dest='$FSX_OZFS_DIR/.ssh' state=link"
+        GENERATE_KEYPAIR=0
+        # Check if id_rsa.pub exists in authorized_keys
+        if ! grep -qF "$(cat id_rsa.pub)" authorized_keys 2>/dev/null; then
+            # If not, add the public key to authorized_keys
+            cat id_rsa.pub >> authorized_keys
+        fi
     fi
-fi
-
-cd $FSX_DIR/.ssh
-
-# Check if id_rsa exists
-if [ ! -f id_rsa ]; then
-    GENERATE_KEYPAIR=1
-else
-    GENERATE_KEYPAIR=0
-    # Check if id_rsa.pub exists in authorized_keys
-    if ! grep -qF "$(cat id_rsa.pub)" authorized_keys 2>/dev/null; then
-        # If not, add the public key to authorized_keys
+
+    if [[ $GENERATE_KEYPAIR == 1 ]]; then
+        echo "Generate a new keypair for $username..."
+        ssh-keygen -t rsa -b 4096 -q -f id_rsa -N "" 2>/dev/null || true
         cat id_rsa.pub >> authorized_keys
+    else
+        echo "Use existing keypair for $username..."
     fi
-fi
-if [[ $GENERATE_KEYPAIR == 1 ]]; then
-    echo Generate a new keypair...
-    ssh-keygen -t rsa -b 4096 -q -f id_rsa -N "" 2>/dev/null || true
-    cat id_rsa.pub >> authorized_keys
+
+    # Set permissions for the ssh keypair
+    ansible localhost -m ansible.builtin.file -a "path=$FSX_DIR/.ssh/authorized_keys state=touch"
+    ansible localhost -b -m ansible.builtin.file -a "path='$FSX_DIR/.ssh/id_rsa' owner=$username group=$username mode=0600"
+    ansible localhost -b -m ansible.builtin.file -a "path='$FSX_DIR/.ssh/id_rsa.pub' owner=$username group=$username mode=0644"
+    ansible localhost -b -m ansible.builtin.file -a "path='$FSX_DIR/.ssh/authorized_keys' owner=$username group=$username mode=0600"
+
+    # Set permissions for the .ssh directory
+    chmod 700 "$FSX_DIR/.ssh"
+
+    # Change ownership to the user
+    chown -R "$username:$username" "$FSX_DIR/.ssh"
+
+    echo "SSH setup completed for user: $username"
+}
+
+# Always setup ubuntu user first
+echo "Setting up SSH keys for default ubuntu user..."
+setup_user_ssh "ubuntu" "/fsx/ubuntu"
+ echo "Contents of $SHARED_USER_FILE:" + cat "$SHARED_USER_FILE" + echo "---" + + while IFS="," read -r username uid home; do + # Trim whitespace from all fields + username=$(echo "$username" | xargs) + uid=$(echo "$uid" | xargs) + home=$(echo "$home" | xargs) + + # Skip empty lines or lines that are just whitespace + if [[ -z "$username" ]] || [[ -z "$home" ]]; then + echo "Skipping empty or invalid line" + continue + fi + + # Verify user exists before trying to set up SSH + if ! id -u "$username" >/dev/null 2>&1; then + echo "WARNING: User $username does not exist, skipping SSH setup" + continue + fi + + echo "Processing user: '$username' with uid: '$uid' and home: '$home'" + setup_user_ssh "$username" "$home" + done < "$SHARED_USER_FILE" + + echo "All users from $SHARED_USER_FILE processed successfully" else - echo Use existing keypair... + echo "No $SHARED_USER_FILE found, only ubuntu user configured" fi -# (diff: do this regardless of if new kp is generated to ensure consistent permissions) -# Set permissions for the ssh keypair -ansible localhost -m ansible.builtin.file -a "path=$FSX_DIR/.ssh/authorized_keys state=touch" -ansible localhost -b -m ansible.builtin.file -a "path='$FSX_DIR/.ssh/id_rsa' owner=ubuntu group=ubuntu mode=0600" -ansible localhost -b -m ansible.builtin.file -a "path='$FSX_DIR/.ssh/id_rsa.pub' owner=ubuntu group=ubuntu mode=0644" -ansible localhost -b -m ansible.builtin.file -a "path='$FSX_DIR/.ssh/authorized_keys' owner=ubuntu group=ubuntu mode=0600" -# Set permissions for the .ssh directory -chmod 700 $FSX_DIR/.ssh -# Change ownership to the ubuntu user \ No newline at end of file +echo "SSH key generation completed for all users" \ No newline at end of file diff --git a/1.architectures/5.sagemaker-hyperpod/automate-smhp-slurm/automate-cluster-creation.sh b/1.architectures/5.sagemaker-hyperpod/automate-smhp-slurm/automate-cluster-creation.sh index aae73093d..e029b0635 100755 --- a/1.architectures/5.sagemaker-hyperpod/automate-smhp-slurm/automate-cluster-creation.sh +++ b/1.architectures/5.sagemaker-hyperpod/automate-smhp-slurm/automate-cluster-creation.sh @@ -995,7 +995,11 @@ configure_cluster_users() { read -p "Hit ENTER once step is complete: " echo -e "\n3. Change the Linux shell profile." - echo -e " It should have '/bin/bash -c 'export HOME=/fsx/\$(whoami) && cd \${HOME} && exec /bin/bash' in its first and only line" + if [[ "$ENABLE_FSX_OPENZFS" == "true" ]]; then + echo -e " It should have '/bin/bash -c 'export HOME=/home/\$(whoami) && cd \${HOME} && exec /bin/bash' in its first and only line" + else + echo -e " It should have '/bin/bash -c 'export HOME=/fsx/\$(whoami) && cd \${HOME} && exec /bin/bash' in its first and only line" + fi read -p "Hit ENTER once you've added this line in: " echo -e "\n${GREEN}✅ SSM Run As support configured successfully${NC}" diff --git a/1.architectures/5.sagemaker-hyperpod/easy-ssh.sh b/1.architectures/5.sagemaker-hyperpod/easy-ssh.sh index bd007129e..a32c26ff4 100755 --- a/1.architectures/5.sagemaker-hyperpod/easy-ssh.sh +++ b/1.architectures/5.sagemaker-hyperpod/easy-ssh.sh @@ -6,6 +6,7 @@ declare -a HELP=( "[-h|--help]" "[-c|--controller-group]" + "[-u|--user]" "[-r|--region]" "[-p|--profile]" "[-d|--dry-run]" @@ -14,6 +15,7 @@ declare -a HELP=( cluster_name="" node_group="controller-machine" +ssh_user="ubuntu" declare -a aws_cli_args=() DRY_RUN=0 @@ -25,12 +27,33 @@ parse_args() { -h|--help) echo "Access a HyperPod Slurm controller via ssh-over-ssm." 
echo "Usage: $(basename ${BASH_SOURCE[0]}) ${HELP[@]}" + echo "" + echo "Options:" + echo " -h, --help Show this help message" + echo " -c, --controller-group Specify the controller group name (default: controller-machine)" + echo " -u, --user Specify the SSH user (default: ubuntu)" + echo " -r, --region Specify AWS region" + echo " -p, --profile Specify AWS profile" + echo " -d, --dry-run Show the SSM command without executing" + echo "" + echo "Examples:" + echo " $(basename ${BASH_SOURCE[0]}) ml-cluster" + echo " $(basename ${BASH_SOURCE[0]}) -c login-group ml-cluster" + echo " $(basename ${BASH_SOURCE[0]}) -u user1 ml-cluster" + echo " $(basename ${BASH_SOURCE[0]}) -u user2 -r us-west-2 ml-cluster" + echo "" + echo "Note: For non-ubuntu users, ensure your IAM user has the SSMSessionRunAs tag" + echo " set to the desired OS username for passwordless login." exit 0 ;; -c|--controller-group) node_group="$2" shift 2 ;; + -u|--user) + ssh_user="$2" + shift 2 + ;; -r|--region) aws_cli_args+=(--region "$2") shift 2 @@ -55,12 +78,19 @@ parse_args() { [[ "$cluster_name" == "" ]] && { echo "Must define a cluster name" ; exit -1 ; } } -# Function to check if ml-cluster exists in ~/.ssh/config +# Function to check if cluster config exists in ~/.ssh/config check_ssh_config() { - if grep -wq "Host ${cluster_name}$" ~/.ssh/config; then - echo -e "${BLUE}1. Detected ${GREEN}${cluster_name}${BLUE} in ${GREEN}~/.ssh/config${BLUE}. Skipping adding...${NC}" + local ssh_host="${cluster_name}" + + # If user is not ubuntu, append username to host for unique config entry + if [[ "$ssh_user" != "ubuntu" ]]; then + ssh_host="${cluster_name}-${ssh_user}" + fi + + if grep -wq "Host ${ssh_host}$" ~/.ssh/config; then + echo -e "${BLUE}1. Detected ${GREEN}${ssh_host}${BLUE} in ${GREEN}~/.ssh/config${BLUE}. 
Skipping adding...${NC}" else - echo -e "${BLUE}Would you like to add ${GREEN}${cluster_name}${BLUE} to ~/.ssh/config (yes/no)?${NC}" + echo -e "${BLUE}Would you like to add ${GREEN}${ssh_host}${BLUE} to ~/.ssh/config (yes/no)?${NC}" read -p "> " ADD_CONFIG if [[ $ADD_CONFIG == "yes" ]]; then @@ -68,16 +98,19 @@ check_ssh_config() { mkdir -p ~/.ssh touch ~/.ssh/config fi - echo -e "${GREEN}✅ adding ml-cluster to ~/.ssh/config:${NC}" + echo -e "${GREEN}✅ adding ${ssh_host} to ~/.ssh/config:${NC}" cat <> ~/.ssh/config -Host ${cluster_name} - User ubuntu +Host ${ssh_host} + User ${ssh_user} ProxyCommand sh -c "aws ssm start-session ${aws_cli_args[@]} --target sagemaker-cluster:${cluster_id}_${node_group}-${instance_id} --document-name AWS-StartSSHSession --parameters 'portNumber=%p'" EOL else - echo -e "${GREEN}❌ skipping adding ml-cluster to ~/.ssh/config:" + echo -e "${GREEN}❌ skipping adding ${ssh_host} to ~/.ssh/config${NC}" fi fi + + # Store the ssh_host for later use + SSH_HOST="${ssh_host}" } escape_spaces() { @@ -88,21 +121,33 @@ escape_spaces() { # Function to add the user's SSH public key to the cluster add_keypair_to_cluster() { PUBLIC_KEY=$(cat ~/.ssh/id_rsa.pub) + + # Determine the authorized_keys path based on user and filesystem + # Check if OpenZFS is mounted (home directory would be /home/username) + local auth_keys_path="/fsx/${ssh_user}/.ssh/authorized_keys" + + # Try to detect if user home is on OpenZFS + local home_check=$(aws ssm start-session --target sagemaker-cluster:${cluster_id}_${node_group}-${instance_id} --document-name AmazonEKS-ExecuteNonInteractiveCommand --parameters command="getent passwd ${ssh_user} | cut -d: -f6" 2>/dev/null || echo "") + + if echo "$home_check" | grep -q "^/home/${ssh_user}"; then + # User home is on OpenZFS, but .ssh is symlinked to /fsx + auth_keys_path="/fsx/${ssh_user}/.ssh/authorized_keys" + fi # Check if the fingerprint already exists in the cluster's authorized_keys - EXISTING_KEYS=$(aws ssm start-session --target sagemaker-cluster:${cluster_id}_${node_group}-${instance_id} --document-name AmazonEKS-ExecuteNonInteractiveCommand --parameters command="cat /fsx/ubuntu/.ssh/authorized_keys") + EXISTING_KEYS=$(aws ssm start-session --target sagemaker-cluster:${cluster_id}_${node_group}-${instance_id} --document-name AmazonEKS-ExecuteNonInteractiveCommand --parameters command="cat ${auth_keys_path}" 2>/dev/null || echo "") if echo "$EXISTING_KEYS" | grep -q "$PUBLIC_KEY"; then - echo -e "${BLUE}2. Detected SSH public key ${GREEN}~/.ssh/id_rsa.pub${BLUE} on the cluster. Skipping adding...${NC}" + echo -e "${BLUE}2. Detected SSH public key ${GREEN}~/.ssh/id_rsa.pub${BLUE} for user ${GREEN}${ssh_user}${BLUE} on the cluster. Skipping adding...${NC}" return else - echo -e "${BLUE}2. Do you want to add your SSH public key ${GREEN}~/.ssh/id_rsa.pub${BLUE} to the cluster (yes/no)?${NC}" + echo -e "${BLUE}2. Do you want to add your SSH public key ${GREEN}~/.ssh/id_rsa.pub${BLUE} to user ${GREEN}${ssh_user}${BLUE} on the cluster (yes/no)?${NC}" read -p "> " ADD_KEYPAIR if [[ $ADD_KEYPAIR == "yes" ]]; then echo "Adding ... ${PUBLIC_KEY}" - command="sed -i \$a$(escape_spaces "$PUBLIC_KEY") /fsx/ubuntu/.ssh/authorized_keys" + command="sed -i \$a$(escape_spaces "$PUBLIC_KEY") ${auth_keys_path}" aws ssm start-session --target sagemaker-cluster:${cluster_id}_${node_group}-${instance_id} --document-name AmazonEKS-ExecuteNonInteractiveCommand --parameters command="$command" - echo "✅ Your SSH public key ~/.ssh/id_rsa.pub has been added to the cluster." 
-            echo "✅ Your SSH public key ~/.ssh/id_rsa.pub has been added to the cluster."
+            echo "✅ Your SSH public key ~/.ssh/id_rsa.pub has been added to user ${ssh_user} on the cluster."
         else
             echo "❌ Skipping adding SSH public key to the cluster."
         fi
     fi
@@ -141,18 +186,30 @@ fi
 echo -e "Cluster id: ${GREEN}${cluster_id}${NC}"
 echo -e "Instance id: ${GREEN}${instance_id}${NC}"
 echo -e "Node Group: ${GREEN}${node_group}${NC}"
+echo -e "SSH User: ${GREEN}${ssh_user}${NC}"
 
 check_ssh_config
 add_keypair_to_cluster
 
 echo -e "\nNow you can run:\n"
-echo -e "$ ${GREEN}ssh ${cluster_name}${NC}"
+echo -e "$ ${GREEN}ssh ${SSH_HOST}${NC}"
 
 [[ DRY_RUN -eq 1 ]] && echo -e "\n${GREEN}aws ssm start-session "${aws_cli_args[@]}" --target sagemaker-cluster:${cluster_id}_${node_group}-${instance_id}${NC}\n" && exit 0
 
-# Start session as Ubuntu only if the SSM-SessionManagerRunShellAsUbuntu document exists.
-if aws ssm describe-document "${aws_cli_args[@]}" --name SSM-SessionManagerRunShellAsUbuntu > /dev/null 2>&1; then
-    aws ssm start-session "${aws_cli_args[@]}" --target sagemaker-cluster:${cluster_id}_${node_group}-${instance_id} --document SSM-SessionManagerRunShellAsUbuntu
+# Determine which SSM document to use based on the user
+if [[ "$ssh_user" == "ubuntu" ]]; then
+    # Start session as Ubuntu if the SSM-SessionManagerRunShellAsUbuntu document exists
+    if aws ssm describe-document "${aws_cli_args[@]}" --name SSM-SessionManagerRunShellAsUbuntu > /dev/null 2>&1; then
+        aws ssm start-session "${aws_cli_args[@]}" --target sagemaker-cluster:${cluster_id}_${node_group}-${instance_id} --document SSM-SessionManagerRunShellAsUbuntu
+    else
+        aws ssm start-session "${aws_cli_args[@]}" --target sagemaker-cluster:${cluster_id}_${node_group}-${instance_id}
+    fi
 else
+    # For non-ubuntu users, check if they have an IAM user with SSMSessionRunAs tag
+    echo -e "${BLUE}Connecting as user: ${GREEN}${ssh_user}${NC}"
+    echo -e "${YELLOW}Note: Make sure the IAM user has the SSMSessionRunAs tag set to '${ssh_user}' for passwordless login${NC}"
+    echo -e "${YELLOW}See: https://docs.aws.amazon.com/systems-manager/latest/userguide/session-manager-getting-started-enable-ssh-connections.html${NC}"
+
+    # Use standard SSM session - the SSMSessionRunAs tag on the IAM user will determine which OS user to use
     aws ssm start-session "${aws_cli_args[@]}" --target sagemaker-cluster:${cluster_id}_${node_group}-${instance_id}
 fi
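Putting the new `-u` flag together with the host-naming logic in `check_ssh_config`, a typical session for a secondary user looks like this (cluster and user names are illustrative):

```bash
# Connect as user1; the ssh config entry is written as ml-cluster-user1.
./easy-ssh.sh -u user1 ml-cluster
# After the ProxyCommand entry and your public key are in place:
ssh ml-cluster-user1
```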