rolling_update: unmask monitor service after a failure
If for some reason the playbook fails after the service was
stopped, disabled and masked, and before it got restarted, enabled and
unmasked, the playbook leaves the service masked, which can confuse
users and forces them to unmask the unit manually.

Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1917680

Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com>
(cherry picked from commit 07029e1)
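
For readers unfamiliar with the construct, the fix below relies on Ansible's
block/rescue error handling: the tasks grouped under block run normally, and
the tasks under rescue run only if one of them fails. A minimal sketch of the
pattern, not the playbook itself (the unit name "ceph-mon@myhost" is
illustrative only):

    - name: upgrade a daemon, restoring its unit state on failure
      block:
        # mask the unit so the package manager cannot restart it mid-upgrade
        - name: stop and mask the service
          systemd:
            name: ceph-mon@myhost
            state: stopped
            enabled: no
            masked: yes
        # ... package upgrade and restart tasks would go here ...
      rescue:
        # if anything in the block failed, leave the unit usable again
        - name: unmask the service
          systemd:
            name: ceph-mon@myhost
            enabled: yes
            masked: no

This way an aborted run no longer strands the unit in a masked state.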
guits committed Mar 29, 2021
1 parent 653d180 commit 82b934c
Showing 1 changed file with 153 additions and 133 deletions.
286 changes: 153 additions & 133 deletions infrastructure-playbooks/rolling_update.yml
@@ -118,150 +118,170 @@
  serial: 1
  become: True
  tasks:
    - name: upgrade ceph mon cluster
      block:
        - name: remove ceph aliases
          file:
            path: /etc/profile.d/ceph-aliases.sh
            state: absent
          when: containerized_deployment | bool

        - name: set mon_host_count
          set_fact:
            mon_host_count: "{{ groups[mon_group_name] | length }}"

        - name: fail when less than three monitors
          fail:
            msg: "Upgrade of cluster with less than three monitors is not supported."
          when: mon_host_count | int < 3

        - name: select a running monitor
          set_fact:
            mon_host: "{{ groups[mon_group_name] | difference([inventory_hostname]) | last }}"

        - import_role:
            name: ceph-defaults
        - import_role:
            name: ceph-facts

        - block:
            - name: get ceph cluster status
              command: "{{ container_exec_cmd }} ceph --cluster {{ cluster }} health -f json"
              register: check_cluster_health
              delegate_to: "{{ mon_host }}"

            - block:
                - name: display ceph health detail
                  command: "{{ container_exec_cmd }} ceph --cluster {{ cluster }} health detail"
                  delegate_to: "{{ mon_host }}"

                - name: fail if cluster isn't in an acceptable state
                  fail:
                    msg: "cluster is not in an acceptable state!"
              when: (check_cluster_health.stdout | from_json).status == 'HEALTH_ERR'
          when: inventory_hostname == groups[mon_group_name] | first

        - name: ensure /var/lib/ceph/bootstrap-rbd-mirror is present
          file:
            path: /var/lib/ceph/bootstrap-rbd-mirror
            owner: "{{ ceph_uid if containerized_deployment else 'ceph' }}"
            group: "{{ ceph_uid if containerized_deployment else 'ceph' }}"
            mode: '755'
            state: directory
          delegate_to: "{{ item }}"
          with_items: "{{ groups[mon_group_name] }}"
          when:
            - cephx | bool
            - inventory_hostname == groups[mon_group_name][0]

        - name: create potentially missing keys (rbd and rbd-mirror)
          ceph_key:
            name: "client.{{ item.0 }}"
            dest: "/var/lib/ceph/{{ item.0 }}/"
            caps:
              mon: "allow profile {{ item.0 }}"
            cluster: "{{ cluster }}"
          delegate_to: "{{ item.1 }}"
          with_nested:
            - ['bootstrap-rbd', 'bootstrap-rbd-mirror']
            - "{{ groups[mon_group_name] }}" # so the key goes on all the nodes
          environment:
            CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment else None }}"
            CEPH_CONTAINER_BINARY: "{{ container_binary }}"
          when:
            - cephx | bool
            - inventory_hostname == groups[mon_group_name][0]

        # NOTE: we mask the service so the RPM can't restart it
        # after the package gets upgraded
        - name: stop ceph mon - shortname
          systemd:
            name: ceph-mon@{{ ansible_facts['hostname'] }}
            state: stopped
            enabled: no
            masked: yes
          ignore_errors: True

        # NOTE: we mask the service so the RPM can't restart it
        # after the package gets upgraded
        - name: stop ceph mon - fqdn
          systemd:
            name: ceph-mon@{{ ansible_facts['fqdn'] }}
            state: stopped
            enabled: no
            masked: yes
          ignore_errors: True

        # only mask the service for mgr because it must be upgraded
        # after ALL monitors, even when collocated
        - name: mask the mgr service
          systemd:
            name: ceph-mgr@{{ ansible_facts['hostname'] }}
            masked: yes
          when: inventory_hostname in groups[mgr_group_name] | default([])
                or groups[mgr_group_name] | default([]) | length == 0

        - import_role:
            name: ceph-handler
        - import_role:
            name: ceph-common
          when: not containerized_deployment | bool
        - import_role:
            name: ceph-container-common
          when: containerized_deployment | bool
        - import_role:
            name: ceph-config
        - import_role:
            name: ceph-mon

        - name: start ceph mgr
          systemd:
            name: ceph-mgr@{{ ansible_facts['hostname'] }}
            state: started
            enabled: yes
          ignore_errors: True # if no mgr collocated with mons

        - name: non container | waiting for the monitor to join the quorum...
          command: ceph --cluster "{{ cluster }}" -m "{{ hostvars[groups[mon_group_name][0]]['_current_monitor_address'] }}" quorum_status --format json
          register: ceph_health_raw
          until:
            - ceph_health_raw.rc == 0
            - (hostvars[inventory_hostname]['ansible_facts']['hostname'] in (ceph_health_raw.stdout | default('{}') | from_json)["quorum_names"] or
               hostvars[inventory_hostname]['ansible_facts']['fqdn'] in (ceph_health_raw.stdout | default('{}') | from_json)["quorum_names"])
          retries: "{{ health_mon_check_retries }}"
          delay: "{{ health_mon_check_delay }}"
          when: not containerized_deployment | bool

        - name: container | waiting for the containerized monitor to join the quorum...
          command: >
            {{ container_binary }} exec ceph-mon-{{ ansible_facts['hostname'] }} ceph --cluster "{{ cluster }}" -m "{{ hostvars[groups[mon_group_name][0]]['_current_monitor_address'] }}" quorum_status --format json
          register: ceph_health_raw
          until:
            - ceph_health_raw.rc == 0
            - (hostvars[inventory_hostname]['ansible_facts']['hostname'] in (ceph_health_raw.stdout | default('{}') | from_json)["quorum_names"] or
               hostvars[inventory_hostname]['ansible_facts']['fqdn'] in (ceph_health_raw.stdout | default('{}') | from_json)["quorum_names"])
          retries: "{{ health_mon_check_retries }}"
          delay: "{{ health_mon_check_delay }}"
          when: containerized_deployment | bool
      rescue:
        - name: unmask the mon service
          systemd:
            name: ceph-mon@{{ item }}
            enabled: yes
            masked: no
          with_items:
            - "{{ ansible_facts['hostname'] }}"
            - "{{ ansible_facts['fqdn'] }}"

        - name: unmask the mgr service
          systemd:
            name: ceph-mgr@{{ ansible_facts['hostname'] }}
            masked: no
          when: inventory_hostname in groups[mgr_group_name] | default([])
                or groups[mgr_group_name] | default([]) | length == 0

- name: reset mon_host
  hosts: "{{ mon_group_name|default('mons') }}"