From a5a6b8d6be7efbe827d7c3dbd51468a65c4ad052 Mon Sep 17 00:00:00 2001
From: Xuanqi He
Date: Wed, 3 Dec 2025 16:37:16 -0500
Subject: [PATCH 1/4] Fix DCV on Ubuntu 22.04+ on DLAMI by disabling Wayland

Disable Wayland in GDM to ensure Xorg is used on headless GPU instances.
Ubuntu 22.04+ defaults to Wayland which causes GDM startup issues with
NVIDIA drivers and NICE DCV. Force Xorg by setting WaylandEnable=false
in /etc/gdm3/custom.conf.
---
Reviewer note: disable_wayland was hardened by hand in the hunk below; the
blob ids on the index line were not regenerated.

 .../resources/dcv/partial/_ubuntu_common.rb | 67 +++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/cookbooks/aws-parallelcluster-platform/resources/dcv/partial/_ubuntu_common.rb b/cookbooks/aws-parallelcluster-platform/resources/dcv/partial/_ubuntu_common.rb
index 2eb8a40cc..4c490468f 100644
--- a/cookbooks/aws-parallelcluster-platform/resources/dcv/partial/_ubuntu_common.rb
+++ b/cookbooks/aws-parallelcluster-platform/resources/dcv/partial/_ubuntu_common.rb
@@ -82,6 +82,73 @@ def optionally_disable_rnd
     end
   end
 
+  # Force GDM to start Xorg by setting WaylandEnable=false in /etc/gdm3/custom.conf.
+  # This is required for Ubuntu 22.04+ where Wayland is the default;
+  # without it, GDM won't start Xorg on headless GPU instances.
+  # Nothing is changed when /etc/gdm3/custom.conf does not exist.
+  def disable_wayland
+    bash 'Disable Wayland in GDM' do
+      user 'root'
+      # Quoted heredoc: the script is passed to bash verbatim, without Ruby escape processing
+      code <<-'DISABLEWAYLAND'
+      set -e
+      conf=/etc/gdm3/custom.conf
+      if [ -f "$conf" ]; then
+        # Normalize every existing WaylandEnable entry, commented out or not, whatever its value.
+        # A leftover active entry such as WaylandEnable=true further down would otherwise
+        # coexist with the inserted line (GLib key files: the last occurrence of a key wins).
+        sed -i -E 's/^[[:space:]]*#?[[:space:]]*WaylandEnable[[:space:]]*=.*$/WaylandEnable=false/' "$conf"
+        if ! grep -q '^WaylandEnable=false' "$conf"; then
+          if grep -q '^[[:space:]]*\[daemon\]' "$conf"; then
+            # No entry at all: add it right below the [daemon] section header
+            sed -i '/^[[:space:]]*\[daemon\]/a WaylandEnable=false' "$conf"
+          else
+            # No [daemon] section either: append one at the end of the file
+            printf '\n[daemon]\nWaylandEnable=false\n' >> "$conf"
+          fi
+        fi
+      fi
+      DISABLEWAYLAND
+    end
+  end
+
+  # Override allow_gpu_acceleration to disable Wayland before starting X
+  def allow_gpu_acceleration
+    # Update the xorg.conf to set up NVIDIA drivers.
+    # NOTE: --enable-all-gpus parameter is needed to support servers with more than one NVIDIA GPU.
+    nvidia_xconfig_command = "nvidia-xconfig --preserve-busid --enable-all-gpus"
+    nvidia_xconfig_command += " --use-display-device=none" if node['ec2']['instance_type'].start_with?("g2.")
+    execute "Set up Nvidia drivers for X configuration" do
+      user 'root'
+      command nvidia_xconfig_command
+    end
+
+    # dcvgl package must be installed after NVIDIA and before starting up X
+    # DO NOT install dcv-gl on non-GPU instances, or will run into a black screen issue
+    install_dcv_gl
+
+    # Disable Wayland to ensure GDM starts Xorg
+    disable_wayland
+
+    # Configure the X server to start automatically when the Linux server boots and start the X server in background
+    bash 'Launch X' do
+      user 'root'
+      code <<-SETUPX
+      set -e
+      systemctl set-default graphical.target
+      systemctl isolate graphical.target &
+      SETUPX
+    end
+
+    # Verify that the X server is running
+    execute 'Wait for X to start' do
+      user 'root'
+      command "pidof X || pidof Xorg"
+      retries 10
+      retry_delay 5
+    end
+  end
+
   def post_install
     # ubuntu-desktop comes with NetworkManager. On a cloud instance NetworkManager is unnecessary and causes delay.
     # Instruct Netplan to use networkd for better performance
From 02c85bad1fc65195e864526b0f384021d06b1496 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Thu, 4 Dec 2025 09:56:19 -0500 Subject: [PATCH 2/4] Add changelog. --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b377bd897..501002536 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste and achieve better performance at scale. - Load kernel module `drm_client_lib` before installation of NVIDIA driver, if available on the kernel. - Reduce dependency footprint by installing the package `sssd-common` rather than `sssd`. +- Disable Wayland in GDM on Ubuntu 22.04+ to ensure Xorg is used on headless GPU instances. - Upgrade Slurm to version 24.11.7 (from 24.11.6).
- Upgrade Pmix to 5.0.9 (from 5.0.6). - Upgrade libjwt to version 1.18.4 (from 1.17.0) for all OSs except Amazon Linux 2. From 06ef920fb97f4bccbe845ce045363b9419230aff Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Thu, 4 Dec 2025 16:10:14 -0500 Subject: [PATCH 3/4] Refine changelog. --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 501002536..902979361 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste and achieve better performance at scale. - Load kernel module `drm_client_lib` before installation of NVIDIA driver, if available on the kernel. - Reduce dependency footprint by installing the package `sssd-common` rather than `sssd`. -- Disable Wayland in GDM on Ubuntu 22.04+ to ensure Xorg is used on headless GPU instances. +- Disable Wayland protocol in GDM3 for Ubuntu 22.04+ to force the use of Xorg on GPU instances running without a display. - Upgrade Slurm to version 24.11.7 (from 24.11.6). - Upgrade Pmix to 5.0.9 (from 5.0.6). - Upgrade libjwt to version 1.18.4 (from 1.17.0) for all OSs except Amazon Linux 2. 
From 56e7c9dd42b905a4d022ef71dd7dd1faaf273779 Mon Sep 17 00:00:00 2001
From: Xuanqi He
Date: Fri, 5 Dec 2025 13:47:23 -0500
Subject: [PATCH 4/4] Add kitchen test to check if GDM is using X11 session type

---
Reviewer note: the control below was restyled by hand (same checks); the
blob ids on the index line were not regenerated.

 .../test/controls/dcv_spec.rb | 28 +++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/dcv_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/dcv_spec.rb
index 9d901205f..96baa91bd 100644
--- a/cookbooks/aws-parallelcluster-platform/test/controls/dcv_spec.rb
+++ b/cookbooks/aws-parallelcluster-platform/test/controls/dcv_spec.rb
@@ -318,3 +318,31 @@
     end
   end
 end
+
+# Applies only when every only_if guard holds: not on Docker, head node, DCV installed and
+# enabled for the head node, graphic instance, NVIDIA installed, DCV GPU acceleration supported.
+control 'tag:config_dcv_xorg_running_with_x11_session_type' do
+  title 'Check that Xorg is running and GDM is using X11 session type (not Wayland)'
+  only_if do
+    dcv_enabled_on_head_node = !os_properties.on_docker? && instance.head_node? &&
+                               instance.dcv_installed? && node['cluster']['dcv_enabled'] == "head_node"
+    dcv_enabled_on_head_node &&
+      instance.graphic? && instance.nvidia_installed? && instance.dcv_gpu_accel_supported?
+  end
+
+  # Succeeds when a process named either Xorg or X exists
+  xorg_pid_lookup = 'pidof Xorg || pidof X'
+  describe 'Xorg process should be running' do
+    subject { command(xorg_pid_lookup) }
+    its('exit_status') { should eq 0 }
+    its('stdout') { should_not be_empty }
+  end
+
+  # Look up the gdm session id(s) with loginctl, then require an x11 match in their Type property
+  gdm_session_ids = "$(loginctl | grep gdm | awk '{print $1}')"
+  session_type_lookup = "loginctl show-session #{gdm_session_ids} -p Type 2>/dev/null | grep -i x11"
+  describe 'GDM should be using X11 session type, not Wayland' do
+    subject { command(session_type_lookup) }
+    its('exit_status') { should eq 0 }
+  end
+end