common: serialise host restart
This commit allows us to restart Ceph daemons machine by machine instead
of restarting all the daemons in a single shot.

Rework the structure of the handlers for clarity as well.

Signed-off-by: Sébastien Han <seb@redhat.com>
leseb committed Jan 31, 2017
1 parent efc49e2 commit 40a2df5
Showing 10 changed files with 146 additions and 27 deletions.
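
Background for the diff below: Ansible handlers cannot be throttled with serial: 1 (that keyword only exists at play level, see ansible/ansible#12170), so each restart handler now loops over its host group with run_once, with_items and delegate_to, restarting one node at a time and then including a validation file that blocks until the cluster is healthy again. A condensed view of the pattern, taken from restart-mon.yml in this commit (the when: guards and the retry logic of the validation step are omitted here):

# Restart monitors one node at a time, then wait for the cluster to recover.
- name: restart ceph mons
  service:
    name: ceph-mon@{{ monitor_name }}
    state: restarted
  run_once: true
  with_items: "{{ groups[mon_group_name] }}"
  delegate_to: "{{ item }}"

- name: validate monitors
  include: validate-mon.yml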
14 changes: 14 additions & 0 deletions group_vars/all.yml.sample
@@ -339,6 +339,20 @@ dummy:
# if you don't want it keep the option commented
#common_single_host_mode: true

## Handlers - restarting daemons after a config change
# If the content of your Ceph configuration changes, for whatever reason,
# the Ceph daemons will be restarted as well. At the moment we cannot detect
# which config option changed, so all the daemons are restarted. The restart
# is serialized, node by node, and a health check is performed in between so
# that we do not move on to the next node until Ceph is healthy again.
# Between the checks (monitors being back in quorum, OSD PGs being clean) we
# have to wait. The retries and delays below are configurable for both
# monitors and OSDs.
#handler_health_mon_check_retries: 5
#handler_health_mon_check_delay: 10
#handler_health_osd_check_retries: 40
#handler_health_osd_check_delay: 30

###################
# CONFIG OVERRIDE #
14 changes: 14 additions & 0 deletions roles/ceph-common/defaults/main.yml
@@ -331,6 +331,20 @@ restapi_port: 5000
# if you don't want it keep the option commented
#common_single_host_mode: true

## Handlers - restarting daemons after a config change
# If the content of your Ceph configuration changes, for whatever reason,
# the Ceph daemons will be restarted as well. At the moment we cannot detect
# which config option changed, so all the daemons are restarted. The restart
# is serialized, node by node, and a health check is performed in between so
# that we do not move on to the next node until Ceph is healthy again.
# Between the checks (monitors being back in quorum, OSD PGs being clean) we
# have to wait. The retries and delays below are configurable for both
# monitors and OSDs.
handler_health_mon_check_retries: 5
handler_health_mon_check_delay: 10
handler_health_osd_check_retries: 40
handler_health_osd_check_delay: 30

###################
# CONFIG OVERRIDE #
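
The handler_health_* knobs above are ordinary role defaults, so a deployment can override them instead of editing the role. A minimal sketch, assuming they are overridden from an inventory-level group_vars/all.yml (the values are illustrative only, not recommendations):

# group_vars/all.yml -- example override of the health check knobs
handler_health_mon_check_retries: 10
handler_health_mon_check_delay: 15
handler_health_osd_check_retries: 60
handler_health_osd_check_delay: 30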
32 changes: 5 additions & 27 deletions roles/ceph-common/handlers/main.yml
@@ -2,41 +2,19 @@
- name: update apt cache
  apt:
    update-cache: yes
  when: ansible_os_family == 'Debian'

- name: restart ceph mons
  service:
    name: ceph-mon@{{ monitor_name }}
    state: restarted
  when:
    - socket.rc == 0
    - mon_group_name in group_names
  include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-mon.yml"

# This does not just restart OSDs but everything else too. Unfortunately
# at this time the ansible role does not have an OSD id list to use
# for restarting them specifically.
- name: restart ceph osds
  service:
    name: ceph.target
    state: restarted
  when:
    - socket.rc == 0
    - osd_group_name in group_names
  include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-osd.yml"

- name: restart ceph mdss
  service:
    name: ceph-mds@{{ mds_name }}
    state: restarted
  when:
    - socket.rc == 0
    - mds_group_name in group_names
  include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-mds.yml"

- name: restart ceph rgws
  service:
    name: ceph-rgw@{{ ansible_hostname }}
    state: restarted
  when:
    - socketrgw.rc == 0
    - rgw_group_name in group_names
  include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-rgw.yml"

- name: restart ceph nfss
  service:
13 changes: 13 additions & 0 deletions roles/ceph-common/handlers/restart-mds.yml
@@ -0,0 +1,13 @@
---
- name: restart ceph mdss
  service:
    name: ceph-mds@{{ mds_name }}
    state: restarted
  # serial: 1 would be the proper solution here, but that can only be set on play level
  # upstream issue: https://github.com/ansible/ansible/issues/12170
  run_once: true
  with_items: "{{ groups[mds_group_name] }}"
  delegate_to: "{{ item }}"
  when:
    - socket.rc == 0
    - mds_group_name in group_names
17 changes: 17 additions & 0 deletions roles/ceph-common/handlers/restart-mon.yml
@@ -0,0 +1,17 @@
---
- name: restart ceph mons
  service:
    name: ceph-mon@{{ monitor_name }}
    state: restarted
  # serial: 1 would be the proper solution here, but that can only be set on play level
  # upstream issue: https://github.com/ansible/ansible/issues/12170
  run_once: true
  with_items: "{{ groups[mon_group_name] }}"
  delegate_to: "{{ item }}"
  when:
    - socket.rc == 0
    - mon_group_name in group_names

- name: validate monitors
  include: validate-mon.yml
  when: mon_group_name in group_names
22 changes: 22 additions & 0 deletions roles/ceph-common/handlers/restart-osd.yml
@@ -0,0 +1,22 @@
---
# This does not just restart OSDs but everything else too. Unfortunately
# at this time the ansible role does not have an OSD id list to use
# for restarting them specifically.
- name: restart ceph osds
  shell: |
    for id in $(ls /var/lib/ceph/osd/ |grep -oh '[0-9]*'); do
      systemctl restart ceph-osd@$id
      sleep 5
    done
  # serial: 1 would be the proper solution here, but that can only be set on play level
  # upstream issue: https://github.com/ansible/ansible/issues/12170
  run_once: true
  with_items: "{{ groups[osd_group_name] }}"
  delegate_to: "{{ item }}"
  when:
    - socket.rc == 0
    - osd_group_name in group_names

- name: validate osds
  include: validate-osd.yml
  when: osd_group_name in group_names
13 changes: 13 additions & 0 deletions roles/ceph-common/handlers/restart-rgw.yml
@@ -0,0 +1,13 @@
---
- name: restart ceph rgws
  service:
    name: ceph-rgw@{{ ansible_hostname }}
    state: restarted
  # serial: 1 would be the proper solution here, but that can only be set on play level
  # upstream issue: https://github.com/ansible/ansible/issues/12170
  run_once: true
  with_items: "{{ groups[rgw_group_name] }}"
  delegate_to: "{{ item }}"
  when:
    - socketrgw.rc == 0
    - rgw_group_name in group_names
28 changes: 28 additions & 0 deletions roles/ceph-common/handlers/validate-mon.yml
@@ -0,0 +1,28 @@
---
- name: wait for ceph monitor socket
  wait_for:
    path: "/var/run/ceph/{{ cluster }}-mon.{{ monitor_name }}.asok"

- name: set mon_host_count
  set_fact: mon_host_count={{ groups[mon_group_name] | length }}

- name: select a running monitor
  set_fact: mon_host={{ item }}
  with_items: "{{ groups[mon_group_name] }}"
  when:
    - item != inventory_hostname
    - mon_host_count | int > 1

- name: select first monitor if only one monitor
  set_fact: mon_host={{ item }}
  with_items: "{{ groups[mon_group_name][0] }}"
  when: mon_host_count | int == 1

- name: waiting for the monitor to join the quorum...
  shell: |
    ceph -s --cluster {{ cluster }} | grep monmap | sed 's/.*quorum//' | egrep -sq {{ ansible_hostname }}
  register: result
  until: result.rc == 0
  retries: "{{ handler_health_mon_check_retries }}"
  delay: "{{ handler_health_mon_check_delay }}"
  delegate_to: "{{ mon_host }}"
19 changes: 19 additions & 0 deletions roles/ceph-common/handlers/validate-osd.yml
@@ -0,0 +1,19 @@
---
- name: collect osds
  shell: |
    ls /var/lib/ceph/osd/ |grep -oh '[0-9]*'
  register: osd_ids

- name: wait for ceph osd socket(s)
  wait_for:
    path: "/var/run/ceph/{{ cluster }}-osd.{{ item }}.asok"
  with_items: "{{ osd_ids.stdout_lines }}"

- name: waiting for clean pgs...
  shell: |
    test "$(ceph --cluster {{ cluster }} pg stat | sed 's/^.*pgs://;s/active+clean.*//;s/ //')" -eq "$(ceph --cluster {{ cluster }} pg stat | sed 's/pgs.*//;s/^.*://;s/ //')" && ceph --cluster {{ cluster }} health | egrep -sq "HEALTH_OK|HEALTH_WARN"
  register: result
  until: result.rc == 0
  retries: "{{ handler_health_osd_check_retries }}"
  delay: "{{ handler_health_osd_check_delay }}"
  delegate_to: "{{ groups[mon_group_name][0] }}"
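
The "waiting for clean pgs" one-liner above is dense: it extracts two numbers from ceph pg stat, compares them, and only succeeds when every PG is active+clean and the overall health is HEALTH_OK or HEALTH_WARN. A readable unrolled equivalent of that shell check, assuming the default cluster name ceph:

#!/bin/sh
# Unrolled version of the check in validate-osd.yml (assumes a cluster named "ceph").
cluster=ceph
# number of PGs currently reported as active+clean
clean_pgs=$(ceph --cluster "$cluster" pg stat | sed 's/^.*pgs://;s/active+clean.*//;s/ //')
# total number of PGs in the cluster
total_pgs=$(ceph --cluster "$cluster" pg stat | sed 's/pgs.*//;s/^.*://;s/ //')
# proceed only when all PGs are clean and overall health is OK or WARN
test "$clean_pgs" -eq "$total_pgs" && \
  ceph --cluster "$cluster" health | egrep -sq "HEALTH_OK|HEALTH_WARN"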
1 change: 1 addition & 0 deletions roles/ceph-common/tasks/generate_ceph_conf.yml
@@ -22,3 +22,4 @@
- restart ceph osds
- restart ceph mdss
- restart ceph rgws
- restart ceph nfss
