From 94222ea02187ed359b871b33505f6ebde19946f1 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Wed, 30 Aug 2023 13:15:08 +0200 Subject: [PATCH 1/2] Avoid failing when building a RHEL8.7 AMI with a kernel not supported by Lustre The latest RHEL8.7 kernel 4.18.0-425.19.2.el8_7 introduced a kABI regression, causing Lustre to re-release the client compiled for this latest version. As a result, this same client can no longer be loaded with modprobe on kernel 4.18.0-425.13.1.el8_7. With this patch we skip the Lustre installation and print a warning in the log, as we already do for 4.18.0-425.3.1.el8, which has a similar issue. ### Tests EC2: ``` # RHEL-8.7.0_HVM-20230330-x86_64-56-Hourly2-GP2 in eu-west-1 # with kernel 4.18.0-425.13.1.el8_7 export KITCHEN_RHEL8_AMI=ami-0fa2f7b35eeb82b7a bash kitchen.ec2.sh environment-install test lustre-rhel8 ``` Before the patch: this test was failing in the converge phase. After the patch: converge passes and prints the following WARN message: ``` WARN: FSx for Lustre is not supported in kernel version 4.18.0-425.13.1.el8_7.x86_64 of RHEL 8.7, please update the kernel version ``` Docker: ``` bash kitchen.docker.sh environment-install test lustre-rhel8 ``` This test passes because on docker we have RHEL 8.8 with a fake value for the kernel version and we skip the modprobe command. Use the 477 version in the kernel release test variable to match the 8.8 kernel version available on docker. 
### References * 4.18.0-425.3.1 issue: https://access.redhat.com/solutions/6985596 * 4.18.0-425.19.2 issue: https://github.com/openzfs/zfs/issues/14724 Signed-off-by: Enrico Usai --- .../resources/lustre/lustre_redhat8.rb | 8 ++-- .../spec/unit/resources/lustre_setup_spec.rb | 28 +++++++------ .../test/controls/lustre_spec.rb | 40 ++++++++++++++++--- kitchen.docker.yml | 2 +- 4 files changed, 55 insertions(+), 23 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/resources/lustre/lustre_redhat8.rb b/cookbooks/aws-parallelcluster-environment/resources/lustre/lustre_redhat8.rb index c3c9ef162d..ae97d04324 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/lustre/lustre_redhat8.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/lustre/lustre_redhat8.rb @@ -25,13 +25,15 @@ action :setup do version = node['platform_version'] + log "Installing FSx for Lustre. Platform version: #{version}, kernel version: #{node['cluster']['kernel_release']}" if version.to_f < 8.2 log "FSx for Lustre is not supported in this RHEL version #{version}, supported versions are >= 8.2" do level :warn end - # rhel8 kernel 4.18.0-425.3.1.el8 has broken kABI compat https://github.com/openzfs/zfs/issues/14724 - elsif node['cluster']['kernel_release'].include? 
"4.18.0-425.3.1.el8" - log "FSx for Lustre is not supported in kernel version 4.18.0-425.3.1.el8 of RHEL, please update the kernel version" do + elsif version.to_f == 8.7 && (node['cluster']['kernel_release'].include?("4.18.0-425.3.1.el8") || node['cluster']['kernel_release'].include?("4.18.0-425.13.1.el8_7")) + # Rhel8.7 kernel 4.18.0-425.3.1.el8 and 4.18.0-425.13.1.el8_7 has broken kABI compat + # See https://access.redhat.com/solutions/6985596 and https://github.com/openzfs/zfs/issues/14724 + log "FSx for Lustre is not supported in kernel version #{node['cluster']['kernel_release']} of RHEL #{version}, please update the kernel version" do level :warn end else diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/lustre_setup_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/lustre_setup_spec.rb index 12d58f1d88..be457e9668 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/lustre_setup_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/lustre_setup_spec.rb @@ -171,21 +171,23 @@ def self.setup(chef_run) end end - context "on redhat with kernel 4.18.0-425.3.1.el8" do - cached(:chef_run) do - runner = runner( - platform: 'redhat', version: '8', - step_into: ['lustre'] - ) do |node| - node.automatic['platform_version'] = "8.2" - node.override['cluster']['kernel_release'] = "anything 4.18.0-425.3.1.el8 something" + [%w(8.7 4.18.0-425.3.1.el8.x86_64), %w(8.7 4.18.0-425.13.1.el8_7.x86_64)].each do |platform_version, kernel_version| + context "on redhat #{platform_version} with kernel #{kernel_version}" do + cached(:chef_run) do + runner = runner( + platform: 'redhat', version: '8', + step_into: ['lustre'] + ) do |node| + node.automatic['platform_version'] = platform_version + node.override['cluster']['kernel_release'] = kernel_version + end + Lustre.setup(runner) end - Lustre.setup(runner) - end - it 'can not install lustre' do - is_expected.to write_log("FSx for Lustre is not 
supported in kernel version 4.18.0-425.3.1.el8 of RHEL, please update the kernel version") - .with(level: :warn) + it 'can not install lustre' do + is_expected.to write_log("FSx for Lustre is not supported in kernel version #{kernel_version} of RHEL #{platform_version}, please update the kernel version") + .with(level: :warn) + end end end diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/lustre_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/lustre_spec.rb index 7da903bb30..67f4c5371b 100644 --- a/cookbooks/aws-parallelcluster-environment/test/controls/lustre_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/test/controls/lustre_spec.rb @@ -1,7 +1,7 @@ control 'tag:install_lustre_client_installed' do title "Verify that lustre client is installed" minimal_lustre_client_version = '2.12' - if (os_properties.centos? && inspec.os.release.to_f >= 7.5) || os_properties.redhat? + if os_properties.centos? && inspec.os.release.to_f >= 7.5 describe package('kmod-lustre-client') do it { should be_installed } end @@ -10,7 +10,33 @@ it { should be_installed } end - if (os_properties.centos? && inspec.os.release.to_f >= 7.7) || os_properties.redhat? + if os_properties.centos? && inspec.os.release.to_f >= 7.7 + describe package('kmod-lustre-client') do + its('version') { should cmp >= minimal_lustre_client_version } + end + + describe package('lustre-client') do + its('version') { should cmp >= minimal_lustre_client_version } + end + + describe yum.repo('aws-fsx') do + it { should exist } + it { should be_enabled } + its('baseurl') { should include 'fsx-lustre-client-repo.s3.amazonaws.com' } + end + end + end + + if os_properties.redhat? 
&& inspec.os.release.to_f >= 8.2 + unless inspec.os.release.to_f == 8.7 && (node['cluster']['kernel_release'].include?("4.18.0-425.3.1.el8") || node['cluster']['kernel_release'].include?("4.18.0-425.13.1.el8_7")) + describe package('kmod-lustre-client') do + it { should be_installed } + end + + describe package('lustre-client') do + it { should be_installed } + end + describe package('kmod-lustre-client') do its('version') { should cmp >= minimal_lustre_client_version } end @@ -60,10 +86,12 @@ control 'tag:install_lustre_lnet_kernel_module_enabled' do title "Verify that lnet kernel module is enabled" only_if { !os_properties.on_docker? && !os_properties.alinux2? } - describe kernel_module("lnet") do - it { should be_loaded } - it { should_not be_disabled } - it { should_not be_blacklisted } + unless os_properties.redhat? && inspec.os.release.to_f == 8.7 && (node['cluster']['kernel_release'].include?("4.18.0-425.3.1.el8") || node['cluster']['kernel_release'].include?("4.18.0-425.13.1.el8_7")) + describe kernel_module("lnet") do + it { should be_loaded } + it { should_not be_disabled } + it { should_not be_blacklisted } + end end end diff --git a/kitchen.docker.yml b/kitchen.docker.yml index 3c2feb29c9..cb196d02c4 100644 --- a/kitchen.docker.yml +++ b/kitchen.docker.yml @@ -68,4 +68,4 @@ platforms: attributes: cluster: base_os: rhel8 - kernel_release: '4.18.0-425.13.1.el8_7.x86_64' + kernel_release: '4.18.0-477.13.1.el8_7.x86_64' # Use 477 version to match 8.8 kernel version available on docker From 3abb7488f9868acabe08d1216a3f809e61f148d7 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Thu, 31 Aug 2023 11:49:20 +0200 Subject: [PATCH 2/2] Add "fake-value" word when overriding kernel release We're overriding the kernel_release variable on docker because it does not have the same format of EC2. Adding the "fake-value" word in the kernel version simplifies debugging on docker. 
Signed-off-by: Enrico Usai --- kitchen.docker.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kitchen.docker.yml b/kitchen.docker.yml index cb196d02c4..62a7b69d87 100644 --- a/kitchen.docker.yml +++ b/kitchen.docker.yml @@ -38,14 +38,14 @@ platforms: attributes: cluster: base_os: alinux2 - kernel_release: '5.10.157-139.675.amzn2.x86_64' + kernel_release: '5.10.157-139.675.amzn2.fake-value' - name: centos7 driver: image: <% if ENV['KITCHEN_CENTOS7_IMAGE'] %> <%= ENV['KITCHEN_CENTOS7_IMAGE'] %> <% else %> dokken/centos-7 <% end %> attributes: cluster: base_os: centos7 - kernel_release: '3.10.0-1160.76.1.el7.x86_64' + kernel_release: '3.10.0-1160.76.1.el7.fake-value' - name: ubuntu2004 driver: image: <% if ENV['KITCHEN_UBUNTU2004_IMAGE'] %> <%= ENV['KITCHEN_UBUNTU2004_IMAGE'] %> <% else %> dokken/ubuntu-20.04 <% end %> @@ -68,4 +68,4 @@ platforms: attributes: cluster: base_os: rhel8 - kernel_release: '4.18.0-477.13.1.el8_7.x86_64' # Use 477 version to match 8.8 kernel version available on docker + kernel_release: '4.18.0-477.13.1.el8_7.fake-value' # Use 477 version to match 8.8 kernel version available on docker