Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -166,11 +166,13 @@ mount_fs() {
local max_attempts=5
local attempt=1
local delay=5
local test_file="$MOUNT_POINT/test_file_$(hostname)"

echo "[INFO] Ensuring $MOUNT_POINT directory exists..."
ansible localhost -b -m ansible.builtin.file -a "path=$MOUNT_POINT state=directory" || true

echo "[INFO] Mounting FSx Lustre on $MOUNT_POINT..."
echo "[INFO] Using test file: $test_file"

while (( attempt <= max_attempts )); do
echo "============================"
Expand All @@ -193,13 +195,13 @@ mount_fs() {
ls -la "$MOUNT_POINT" >/dev/null 2>&1 || true

echo "[STEP] Testing file access (touch)..."
if ! ansible localhost -b -m ansible.builtin.file -a "path=$MOUNT_POINT/test_file state=touch"; then
if ! ansible localhost -b -m ansible.builtin.file -a "path=$test_file state=touch"; then
echo "[WARN] Touch failed — retrying in $delay seconds"
sleep "$delay"; ((attempt++)); continue
fi

echo "[STEP] Testing file access (delete)..."
if ! ansible localhost -b -m ansible.builtin.file -a "path=$MOUNT_POINT/test_file state=absent"; then
if ! ansible localhost -b -m ansible.builtin.file -a "path=$test_file state=absent"; then
echo "[WARN] Delete failed — retrying in $delay seconds"
sleep "$delay"; ((attempt++)); continue
fi
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,21 +76,65 @@ install_nfs_client()
# Mount the FSx OpenZFS file system
mount_fs()
{
# Create mount point directory if it doesn't exist
if [ ! -d "$OPENZFS_MOUNT_POINT" ]; then
mkdir -p "$OPENZFS_MOUNT_POINT"
fi
local max_attempts=5
local attempt=1
local delay=5
local test_file="$OPENZFS_MOUNT_POINT/test_file_$(hostname)"

echo "[INFO] Ensuring $OPENZFS_MOUNT_POINT directory exists..."
ansible localhost -b -m ansible.builtin.file -a "path=$OPENZFS_MOUNT_POINT state=directory" || true

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suppose we have two nodes. If both of them create the same statically named test_file on the shared file system, do we need to worry about a race condition?

T1: Instance A creates test_file
T2: Instance B creates test_file (same name)
T3: Instance A deletes test_file  
T4: Instance B tries to delete test_file → FAILS (already gone)
T5: Instance B thinks mount is broken, retries entire process

Would something like the following help?

TEST_FILE="$OPENZFS_MOUNT_POINT/test_file_${HOSTNAME}_$$"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is a good catch. I can implement this here. Thanks for catching this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@PremiumSpider This has been implemented now.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tested this as well and looks good.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for testing on your end as well.


echo "[INFO] Mounting FSx OpenZFS on $OPENZFS_MOUNT_POINT..."
echo "[INFO] Using test file: $test_file"

while (( attempt <= max_attempts )); do
echo "============================"
echo "[INFO] Attempt $attempt of $max_attempts"
echo "============================"

echo "[STEP] Mounting FSx OpenZFS..."
if ! ansible localhost -b -m ansible.posix.mount -a \
"path=$OPENZFS_MOUNT_POINT src=$FSX_OPENZFS_DNS_NAME:/fsx fstype=nfs opts=nfsvers=$NFS_VERSION,_netdev,nconnect=16,x-systemd.automount,x-systemd.requires=network-online.target dump=0 passno=0 state=mounted"; then
echo "[WARN] Mount command failed — retrying in $delay seconds"
sleep "$delay"; ((attempt++)); continue
fi

echo "[STEP] Verifying mountpoint..."
if ! ansible localhost -b -m ansible.builtin.command -a "mountpoint $OPENZFS_MOUNT_POINT"; then
echo "[WARN] Mountpoint verification failed — retrying in $delay seconds"
sleep "$delay"; ((attempt++)); continue
fi

retry_with_backoff $MAX_ATTEMPTS $INITIAL_BACKOFF "ansible localhost -b -m ansible.posix.mount -a \"path=$OPENZFS_MOUNT_POINT src=$FSX_OPENZFS_DNS_NAME:/fsx fstype=nfs opts=nfsvers=$NFS_VERSION,_netdev,nconnect=16,x-systemd.automount,x-systemd.requires=network-online.target dump=0 passno=0 state=mounted\""
echo "[STEP] Triggering automount..."
ls -la "$OPENZFS_MOUNT_POINT" >/dev/null 2>&1 || true

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this a potential silent error because of the `|| true`?
Do we need to retry? Can we at least log it if there is an issue?

Example

echo "[STEP] Triggering automount..."
if ! ls -la "$OPENZFS_MOUNT_POINT" >/dev/null 2>&1; then
    echo "[WARN] Could not list mount directory contents"
    # log the issue
fi

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@PremiumSpider This does qualify as a silent error because of the `|| true`, but we don't need to log anything here because this step only triggers the automount: the ls command accesses the mount point, which causes the file system to be mounted. We have explicit verification in the next step with touch and delete. In my opinion, I would leave it as it is.


echo "[STEP] Testing file access (touch)..."
if ! ansible localhost -b -m ansible.builtin.file -a "path=$test_file state=touch"; then
echo "[WARN] Touch failed — retrying in $delay seconds"
sleep "$delay"; ((attempt++)); continue
fi

echo "[STEP] Testing file access (delete)..."
if ! ansible localhost -b -m ansible.builtin.file -a "path=$test_file state=absent"; then
echo "[WARN] Delete failed — retrying in $delay seconds"
sleep "$delay"; ((attempt++)); continue
fi

echo "[SUCCESS] FSx OpenZFS mount succeeded on attempt $attempt"
return 0
done

echo "[ERROR] FSx OpenZFS mount failed after $max_attempts attempts"
return 1
}

# Verify mount was successful
verify_mount()
# Restart systemd daemon to ensure mount units are properly loaded
restart_daemon()
{
if ! mountpoint -q "$OPENZFS_MOUNT_POINT"; then
echo "Failed to verify mount point $OPENZFS_MOUNT_POINT"
exit 1
fi
ansible localhost -b -m ansible.builtin.systemd -a "daemon_reload=yes"
ansible localhost -b -m ansible.builtin.systemd -a "name=remote-fs.target state=restarted"
echo "Check status of OpenZFS automount..."
systemctl list-units | grep -i automount || true
}

main()
Expand All @@ -99,8 +143,8 @@ main()
echo "Using openzfs_mount_point: $OPENZFS_MOUNT_POINT"
verify_parameters
install_nfs_client
mount_fs
verify_mount
mount_fs || exit 1
restart_daemon
echo "FSx OpenZFS mounted successfully to $OPENZFS_MOUNT_POINT"
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ FSX_OZFS_EXISTS=$1
FSX_OPENZFS_DNS_NAME="/home"
FSX_L_DNS_NAME="/fsx"

# Look for shared_users.txt in parent directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SHARED_USER_FILE="${SCRIPT_DIR}/../shared_users.txt"

# Function to check mount
check_mount()
{
Expand Down Expand Up @@ -36,56 +40,152 @@ wait_for_mount()
done
}

# Function to setup home directory for a user with OpenZFS
#
# Arguments:
#   $1 - username: account whose home directory is placed under
#        $FSX_OPENZFS_DNS_NAME
# Globals:
#   FSX_OPENZFS_DNS_NAME (read) - parent directory for the new home
#   FSX_L_DNS_NAME       (read) - path handed to wait_for_mount; a per-user
#                                 directory is also created beneath it
# Outputs:
#   Progress and warning messages on stdout
# Returns:
#   Status of the last command run (the final chown, or the warning message
#   when the FSx Lustre mount is not available)
setup_user_home_openzfs()
{
  local username=$1
  # Per-user directory kept on the FSx Lustre side.
  local lustre_dir="$FSX_L_DNS_NAME/$username"

  printf '%s\n' "Setting up OpenZFS home directory for user: $username"

  # Create user directory on OpenZFS, owned by the user.
  ansible localhost -b -m ansible.builtin.file \
    -a "path='$FSX_OPENZFS_DNS_NAME/$username' state=directory owner=$username group=$username mode=0755"

  # Point the account's home at the directory created above.
  ansible localhost -b -m ansible.builtin.user \
    -a "name=$username home='$FSX_OPENZFS_DNS_NAME/$username' move_home=yes"
  printf '%s\n' "Home directory set to $FSX_OPENZFS_DNS_NAME/$username"

  # Without the FSx Lustre mount there is nothing more to do.
  if ! wait_for_mount "$FSX_L_DNS_NAME"; then
    printf '%s\n' "Warning: FSx Lustre mount not available, skipping $FSX_L_DNS_NAME/$username setup"
    return
  fi

  # Maintain access to the user's directory on FSx Lustre.
  sudo mkdir -p "$lustre_dir"
  sudo chown "$username:$username" "$lustre_dir"
}

# Function to setup home directory for a user with FSx Lustre only
#
# Arguments:
#   $1 - username: account whose home directory is re-pointed
#   $2 - fsx_home: target home directory path for that account
# Globals:
#   FSX_L_DNS_NAME (read) - directory that must exist before fsx_home is
#                           created
# Outputs:
#   Progress, warning and error messages on stdout
# Returns:
#   2 if either argument is empty;
#   1 if neither fsx_home nor FSX_L_DNS_NAME exists (nothing was changed);
#   otherwise the status of the last command run
setup_user_home_fsx_lustre()
{
  local username=${1:-}
  local fsx_home=${2:-}

  # Refuse to continue with a missing argument: an empty user name or path
  # would otherwise be handed straight to mkdir/chown/usermod.
  if [[ -z "$username" || -z "$fsx_home" ]]; then
    echo "Error: setup_user_home_fsx_lustre requires <username> <fsx_home>"
    return 2
  fi

  echo "Setting up FSx Lustre home directory for user: $username at $fsx_home"

  if [[ -d "$fsx_home" ]]; then
    # Target directory already exists: only re-point the account at it.
    sudo usermod -d "$fsx_home" "$username"
  elif [[ -d "$FSX_L_DNS_NAME" ]]; then
    # Create the directory
    sudo mkdir -p "$fsx_home"
    sudo chown "$username:$username" "$fsx_home"

    # Try to change home directory with move
    if ! sudo usermod -m -d "$fsx_home" "$username"; then
      echo "Warning: Could not move home directory for $username. Setting home without moving files."

      # If user has existing home, copy contents
      if [[ -d "/home/$username" ]]; then
        sudo rsync -a "/home/$username/" "$fsx_home/"
      fi
      sudo chown -R "$username:$username" "$fsx_home"

      sudo usermod -d "$fsx_home" "$username"
    else
      echo "Home directory moved successfully to $fsx_home"
    fi
  else
    # Neither the target nor the FSx Lustre directory exists. Report it
    # instead of silently returning success with nothing configured.
    echo "Warning: Neither $fsx_home nor $FSX_L_DNS_NAME exists, home directory for $username not changed"
    return 1
  fi
}

if [ -z "$FSX_OZFS_EXISTS" ]; then
echo "Error: Missing parameter. Usage: $0 <1|0> (1 if OpenZFS exists, 0 otherwise)"
exit 1
fi

# Check if OpenZFS is mounted
if [ $FSX_OZFS_EXISTS -eq 1 ]; then
echo "OpenZFS is mounted. Looping to ensure FSxOZFS is mounted."

echo "OpenZFS is mounted. Setting up home directories on OpenZFS."
if wait_for_mount "$FSX_OPENZFS_DNS_NAME"; then
ansible localhost -b -m ansible.builtin.file -a "path='$FSX_OPENZFS_DNS_NAME/ubuntu' state=directory owner=ubuntu group=ubuntu mode=0755"

echo "OpenZFS is mounted at $FSX_OPENZFS_DNS_NAME"
# Set home directory to /home/ubuntu
ansible localhost -b -m ansible.builtin.user -a "name=ubuntu home='$FSX_OPENZFS_DNS_NAME/ubuntu' move_home=yes"
echo "Home directory set to $FSX_OPENZFS_DNS_NAME/ubuntu"

# Maintain access to /fsx/ubuntu
if wait_for_mount "$FSX_L_DNS_NAME"; then
sudo mkdir -p "$FSX_L_DNS_NAME/ubuntu"
sudo chown ubuntu:ubuntu "$FSX_L_DNS_NAME/ubuntu"
# Setup ubuntu user first
echo "Setting up home directory for default ubuntu user..."
setup_user_home_openzfs "ubuntu"

# Process additional users from shared_users.txt if it exists
if [[ -f $SHARED_USER_FILE ]]; then
echo "Found $SHARED_USER_FILE, processing additional users..."
echo "Contents of $SHARED_USER_FILE:"
cat "$SHARED_USER_FILE"

while IFS="," read -r username uid home; do
# Trim whitespace from all fields
username=$(echo "$username" | xargs)
uid=$(echo "$uid" | xargs)
home=$(echo "$home" | xargs)

# Skip empty lines or lines that are just whitespace
if [[ -z "$username" ]]; then
echo "Skipping empty or invalid line"
continue
fi

# Verify user exists before trying to set up home
if ! id -u "$username" >/dev/null 2>&1; then
echo "WARNING: User $username does not exist, skipping home setup"
continue
fi

echo "Processing home directory for user: '$username'"
setup_user_home_openzfs "$username"
done < "$SHARED_USER_FILE"

echo "All users from $SHARED_USER_FILE processed successfully"
else
echo "Warning: FSx mount not available, skipping $FSX_L_DNS_NAME/ubuntu setup"
echo "No $SHARED_USER_FILE found, only ubuntu user configured"
fi
fi
else
echo "OpenZFS is not mounted. Skipped OZFS check loop, and looping for FSxL only."
echo "Using FSxL file system as home..."

echo "OpenZFS is not mounted. Using FSx Lustre file system as home..."

if ! wait_for_mount "$FSX_L_DNS_NAME"; then
echo "Warning: FSx mount not available. Exiting."
exit 1
fi
if [ -d "$FSX_L_DNS_NAME/ubuntu" ]; then
sudo usermod -d "$FSX_L_DNS_NAME/ubuntu" ubuntu
elif [ -d "$FSX_L_DNS_NAME" ]; then
# Create the directory (race condition: if it doesn't get detected)
sudo mkdir -p "$FSX_L_DNS_NAME/ubuntu"
sudo chown ubuntu:ubuntu "$FSX_L_DNS_NAME/ubuntu"

# Try to change home directory with move (race condition)
if ! sudo usermod -m -d "$FSX_L_DNS_NAME/ubuntu" ubuntu; then
echo "Warning: Could not move home directory. Setting home without moving files."

sudo rsync -a /home/ubuntu/ "$FSX_L_DNS_NAME/ubuntu/"
sudo chown -R ubuntu:ubuntu "$FSX_L_DNS_NAME/ubuntu"

sudo usermod -d "$FSX_L_DNS_NAME/ubuntu" ubuntu
else
echo "Home directory moved successfully to $FSX_L_DNS_NAME/ubuntu"
fi

# Setup ubuntu user first
echo "Setting up home directory for default ubuntu user..."
setup_user_home_fsx_lustre "ubuntu" "$FSX_L_DNS_NAME/ubuntu"

# Process additional users from shared_users.txt if it exists
if [[ -f $SHARED_USER_FILE ]]; then
echo "Found $SHARED_USER_FILE, processing additional users..."
echo "Contents of $SHARED_USER_FILE:"
cat "$SHARED_USER_FILE"

while IFS="," read -r username uid home; do
# Trim whitespace from all fields
username=$(echo "$username" | xargs)
uid=$(echo "$uid" | xargs)
home=$(echo "$home" | xargs)

# Skip empty lines or lines that are just whitespace
if [[ -z "$username" ]] || [[ -z "$home" ]]; then
echo "Skipping empty or invalid line"
continue
fi

# Verify user exists before trying to set up home
if ! id -u "$username" >/dev/null 2>&1; then
echo "WARNING: User $username does not exist, skipping home setup"
continue
fi

echo "Processing home directory for user: '$username' at '$home'"
setup_user_home_fsx_lustre "$username" "$home"
done < "$SHARED_USER_FILE"

echo "All users from $SHARED_USER_FILE processed successfully"
else
echo "No $SHARED_USER_FILE found, only ubuntu user configured"
fi
fi
fi

echo "Home directory setup completed for all users"
Loading
Loading