common: serialise host restart
This commit allows us to restart Ceph daemons machine by machine instead
of restarting all the daemons in a single shot.

Rework the structure of the handlers for clarity as well.

Signed-off-by: Sébastien Han <seb@redhat.com>
(cherry picked from commit 40a2df5)

Conflicts:
	roles/ceph-common/handlers/main.yml
	roles/ceph-common/tasks/generate_ceph_conf.yml
leseb authored and andrewschoen committed Jan 31, 2017
1 parent 7cfa152 commit ce7ad22
Showing 10 changed files with 145 additions and 120 deletions.
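
At a glance, a ceph.conf change now notifies one handler per daemon type; each handler is a thin include that restarts the corresponding daemons host by host (run_once plus delegate_to standing in for play-level serial) and then validates cluster health before the next host is touched. A condensed sketch of the wiring, with names taken from the diff below (the config-template task itself is elided):

# roles/ceph-common/handlers/main.yml -- the notified handler is just an include
- name: restart ceph mons
  include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-mon.yml"

# roles/ceph-common/handlers/restart-mon.yml -- walk the mon group one host at a time
- name: restart ceph mons
  service:
    name: ceph-mon@{{ monitor_name }}
    state: restarted
  run_once: true
  with_items: "{{ groups[mon_group_name] }}"
  delegate_to: "{{ item }}"
  when:
    - socket.rc == 0
    - mon_group_name in group_names

- name: validate monitors
  include: validate-mon.yml
  when: mon_group_name in group_names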
14 changes: 14 additions & 0 deletions group_vars/all.yml.sample
@@ -316,6 +316,20 @@ dummy:
# if you don't want it keep the option commented
#common_single_host_mode: true

## Handlers - restarting daemons after a config change
# If, for whatever reason, the content of your Ceph configuration changes,
# the Ceph daemons will be restarted as well. At the moment we cannot detect
# which config option changed, so all the daemons will be restarted. The
# restart is serialized node by node, and a health check is run in between
# so we do not move on to the next node until Ceph is healthy again.
# Because the checks (monitors in quorum, OSD PGs active+clean) take time,
# the number of retries and the delay between them are configurable for
# both monitors and OSDs.
#handler_health_mon_check_retries: 5
#handler_health_mon_check_delay: 10
#handler_health_osd_check_retries: 40
#handler_health_osd_check_delay: 30

###################
# CONFIG OVERRIDE #
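To tune how long the play waits for the cluster to become healthy between nodes, uncomment the handler_health_* settings above and adjust them, for example in group_vars/all.yml (the numbers below are only illustrative):

handler_health_mon_check_retries: 10
handler_health_mon_check_delay: 15
handler_health_osd_check_retries: 60
handler_health_osd_check_delay: 20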
14 changes: 14 additions & 0 deletions roles/ceph-common/defaults/main.yml
@@ -308,6 +308,20 @@ restapi_port: 5000
# if you don't want it keep the option commented
#common_single_host_mode: true

## Handlers - restarting daemons after a config change
# If, for whatever reason, the content of your Ceph configuration changes,
# the Ceph daemons will be restarted as well. At the moment we cannot detect
# which config option changed, so all the daemons will be restarted. The
# restart is serialized node by node, and a health check is run in between
# so we do not move on to the next node until Ceph is healthy again.
# Because the checks (monitors in quorum, OSD PGs active+clean) take time,
# the number of retries and the delay between them are configurable for
# both monitors and OSDs.
handler_health_mon_check_retries: 5
handler_health_mon_check_delay: 10
handler_health_osd_check_retries: 40
handler_health_osd_check_delay: 30

###################
# CONFIG OVERRIDE #
116 changes: 5 additions & 111 deletions roles/ceph-common/handlers/main.yml
@@ -2,125 +2,19 @@
- name: update apt cache
apt:
update-cache: yes
when: ansible_os_family == 'Debian'

- name: restart ceph mons
command: service ceph restart mon
when:
- socket.rc == 0
- ansible_distribution != 'Ubuntu'
- mon_group_name in group_names
- ceph_release_num.{{ ceph_release }} < ceph_release_num.infernalis

- name: restart ceph mons with systemd
service:
name: ceph-mon@{{ monitor_name }}
state: restarted
when:
- socket.rc == 0
- use_systemd
- mon_group_name in group_names
- ceph_release_num.{{ ceph_release }} > ceph_release_num.hammer

- name: restart ceph mons on ubuntu
command: initctl restart ceph-mon cluster={{ cluster }} id={{ monitor_name }}
when:
- socket.rc == 0
- ansible_distribution == 'Ubuntu'
- not use_systemd
- mon_group_name in group_names
include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-mon.yml"

- name: restart ceph osds
command: service ceph restart osd
when:
- socket.rc == 0
- ansible_distribution != 'Ubuntu'
- osd_group_name in group_names
- ceph_release_num.{{ ceph_release }} < ceph_release_num.infernalis

# This does not just restart OSDs but everything else too. Unfortunately
# at this time the ansible role does not have an OSD id list to use
# for restarting them specifically.
- name: restart ceph osds with systemd
service:
name: ceph.target
state: restarted
when:
- socket.rc == 0
- use_systemd
- osd_group_name in group_names
- ceph_release_num.{{ ceph_release }} > ceph_release_num.hammer

- name: restart ceph osds on ubuntu
shell: |
for id in $(ls /var/lib/ceph/osd/ |grep -oh '[0-9]*'); do
initctl restart ceph-osd cluster={{ cluster }} id=$id
done
when:
- socket.rc == 0
- ansible_distribution == 'Ubuntu'
- not use_systemd
- osd_group_name in group_names

- name: restart ceph mdss on ubuntu
command: initctl restart ceph-mds cluster={{ cluster }} id={{ ansible_hostname }}
when:
- socket.rc == 0
- ansible_distribution == 'Ubuntu'
- not use_systemd
- mds_group_name in group_names
include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-osd.yml"

- name: restart ceph mdss
command: service ceph restart mds
when:
- socket.rc == 0
- ansible_distribution != 'Ubuntu'
- use_systemd
- mds_group_name in group_names
- ceph_release_num.{{ ceph_release }} < ceph_release_num.infernalis

- name: restart ceph mdss with systemd
service:
name: ceph-mds@{{ mds_name }}
state: restarted
when:
- socket.rc == 0
- use_systemd
- mds_group_name in group_names
- ceph_release_num.{{ ceph_release }} > ceph_release_num.hammer

- name: restart ceph rgws on ubuntu
command: initctl restart radosgw cluster={{ cluster }} id=rgw.{{ ansible_hostname }}
when:
- socketrgw.rc == 0
- ansible_distribution == 'Ubuntu'
- not use_systemd
- rgw_group_name in group_names
include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-mds.yml"

- name: restart ceph rgws
command: /etc/init.d/radosgw restart
when:
- socketrgw.rc == 0
- ansible_distribution != 'Ubuntu'
- rgw_group_name in group_names
- ceph_release_num.{{ ceph_release }} < ceph_release_num.infernalis

- name: restart ceph rgws on red hat
command: /etc/init.d/ceph-radosgw restart
when:
- socketrgw.rc == 0
- ansible_os_family == 'RedHat'
- rgw_group_name in group_names
- ceph_release_num.{{ ceph_release }} < ceph_release_num.infernalis

- name: restart ceph rgws with systemd
service:
name: ceph-rgw@{{ ansible_hostname }}
state: restarted
when:
- socketrgw.rc == 0
- use_systemd
- rgw_group_name in group_names
- ceph_release_num.{{ ceph_release }} > ceph_release_num.hammer
include: "{{ playbook_dir }}/roles/ceph-common/handlers/restart-rgw.yml"

- name: restart ceph nfss
service:
13 changes: 13 additions & 0 deletions roles/ceph-common/handlers/restart-mds.yml
@@ -0,0 +1,13 @@
---
- name: restart ceph mdss
service:
name: ceph-mds@{{ mds_name }}
state: restarted
# serial: 1 would be the proper solution here, but that can only be set on play level
# upstream issue: https://github.com/ansible/ansible/issues/12170
run_once: true
with_items: "{{ groups[mds_group_name] }}"
delegate_to: "{{ item }}"
when:
- socket.rc == 0
- mds_group_name in group_names
17 changes: 17 additions & 0 deletions roles/ceph-common/handlers/restart-mon.yml
@@ -0,0 +1,17 @@
---
- name: restart ceph mons
service:
name: ceph-mon@{{ monitor_name }}
state: restarted
# serial: 1 would be the proper solution here, but that can only be set on play level
# upstream issue: https://github.com/ansible/ansible/issues/12170
run_once: true
with_items: "{{ groups[mon_group_name] }}"
delegate_to: "{{ item }}"
when:
- socket.rc == 0
- mon_group_name in group_names

- name: validate monitors
include: validate-mon.yml
when: mon_group_name in group_names
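
The recurring comment about serial: 1 refers to Ansible's play-level keyword, which cannot be applied to handlers inside a role; that is why the run_once/delegate_to loop above emulates one-host-at-a-time behaviour. For illustration only, play-level serialization would look like this (hypothetical play, not part of this commit):

- hosts: mons
  serial: 1   # act on one monitor at a time
  roles:
    - ceph-common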
22 changes: 22 additions & 0 deletions roles/ceph-common/handlers/restart-osd.yml
@@ -0,0 +1,22 @@
---
# Unfortunately at this time the ansible role does not keep an OSD id list
# as a fact, so the ids are discovered from /var/lib/ceph/osd/ and each OSD
# on the node is restarted in turn.
- name: restart ceph osds
shell: |
for id in $(ls /var/lib/ceph/osd/ |grep -oh '[0-9]*'); do
systemctl restart ceph-osd@$id
sleep 5
done
# serial: 1 would be the proper solution here, but that can only be set on play level
# upstream issue: https://github.com/ansible/ansible/issues/12170
run_once: true
with_items: "{{ groups[osd_group_name] }}"
delegate_to: "{{ item }}"
when:
- socket.rc == 0
- osd_group_name in group_names

- name: validate osds
include: validate-osd.yml
when: osd_group_name in group_names
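
For reference, the shell loop above could also be expressed with the service module once the ids are registered, much like validate-osd.yml does for the socket check. A sketch (not part of this commit, and it drops the explicit 5-second pause between restarts):

- name: collect osd ids
  shell: ls /var/lib/ceph/osd/ | grep -oh '[0-9]*'
  register: osd_ids

- name: restart ceph osds one by one
  service:
    name: "ceph-osd@{{ item }}"
    state: restarted
  with_items: "{{ osd_ids.stdout_lines }}"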
13 changes: 13 additions & 0 deletions roles/ceph-common/handlers/restart-rgw.yml
@@ -0,0 +1,13 @@
---
- name: restart ceph rgws
service:
name: ceph-rgw@{{ ansible_hostname }}
state: restarted
# serial: 1 would be the proper solution here, but that can only be set on play level
# upstream issue: https://github.com/ansible/ansible/issues/12170
run_once: true
with_items: "{{ groups[rgw_group_name] }}"
delegate_to: "{{ item }}"
when:
- socketrgw.rc == 0
- rgw_group_name in group_names
28 changes: 28 additions & 0 deletions roles/ceph-common/handlers/validate-mon.yml
@@ -0,0 +1,28 @@
---
- name: wait for ceph monitor socket
wait_for:
path: "/var/run/ceph/{{ cluster }}-mon.{{ monitor_name }}.asok"

- name: set mon_host_count
set_fact: mon_host_count={{ groups[mon_group_name] | length }}

- name: select a running monitor
set_fact: mon_host={{ item }}
with_items: "{{ groups[mon_group_name] }}"
when:
- item != inventory_hostname
- mon_host_count | int > 1

- name: select first monitor if only one monitor
set_fact: mon_host={{ item }}
with_items: "{{ groups[mon_group_name][0] }}"
when: mon_host_count | int == 1

- name: waiting for the monitor to join the quorum...
shell: |
ceph -s --cluster {{ cluster }} | grep monmap | sed 's/.*quorum//' | egrep -sq {{ ansible_hostname }}
register: result
until: result.rc == 0
retries: "{{ handler_health_mon_check_retries }}"
delay: "{{ handler_health_mon_check_delay }}"
delegate_to: "{{ mon_host }}"
19 changes: 19 additions & 0 deletions roles/ceph-common/handlers/validate-osd.yml
@@ -0,0 +1,19 @@
---
- name: collect osds
shell: |
ls /var/lib/ceph/osd/ |grep -oh '[0-9]*'
register: osd_ids

- name: wait for ceph osd socket(s)
wait_for:
path: "/var/run/ceph/{{ cluster }}-osd.{{ item }}.asok"
with_items: "{{ osd_ids.stdout_lines }}"

- name: waiting for clean pgs...
shell: |
test "$(ceph --cluster {{ cluster }} pg stat | sed 's/^.*pgs://;s/active+clean.*//;s/ //')" -eq "$(ceph --cluster {{ cluster }} pg stat | sed 's/pgs.*//;s/^.*://;s/ //')" && ceph --cluster {{ cluster }} health | egrep -sq "HEALTH_OK|HEALTH_WARN"
register: result
until: result.rc == 0
retries: "{{ handler_health_osd_check_retries }}"
delay: "{{ handler_health_osd_check_delay }}"
delegate_to: "{{ groups[mon_group_name][0] }}"
9 changes: 0 additions & 9 deletions roles/ceph-common/tasks/generate_ceph_conf.yml
@@ -19,16 +19,7 @@
config_type: ini
notify:
- restart ceph mons
- restart ceph mons on ubuntu
- restart ceph mons with systemd
- restart ceph osds
- restart ceph osds on ubuntu
- restart ceph osds with systemd
- restart ceph mdss
- restart ceph mdss on ubuntu
- restart ceph mdss with systemd
- restart ceph rgws
- restart ceph rgws on ubuntu
- restart ceph rgws on red hat
- restart ceph rgws with systemd
- restart ceph nfss
