Skip to content

Commit

Permalink
ceph-crash: introduce new role ceph-crash
Browse files Browse the repository at this point in the history
This commit introduces a new role `ceph-crash` in order to deploy
everything needed for the ceph-crash daemon.

Signed-off-by: Guillaume Abrioux <gabrioux@redhat.com>
(cherry picked from commit 9d2f210)
  • Loading branch information
guits authored and dsavineau committed Jul 22, 2020
1 parent 0b5a264 commit e6059fd
Show file tree
Hide file tree
Showing 19 changed files with 395 additions and 1 deletion.
10 changes: 10 additions & 0 deletions infrastructure-playbooks/docker-to-podman.yml
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,16 @@
tasks_from: systemd.yml
when: inventory_hostname in groups.get(rgw_group_name, [])

# Run the ceph-crash role's systemd.yml entry point on any host that belongs
# to at least one of the mon, osd, mds, rgw, mgr or rbd-mirror groups.
# groups.get(<name>, []) keeps the membership test valid when a group is
# missing from the inventory.
- import_role:
    name: ceph-crash
    tasks_from: systemd.yml
  when: >-
    inventory_hostname in groups.get(mon_group_name, []) or
    inventory_hostname in groups.get(osd_group_name, []) or
    inventory_hostname in groups.get(mds_group_name, []) or
    inventory_hostname in groups.get(rgw_group_name, []) or
    inventory_hostname in groups.get(mgr_group_name, []) or
    inventory_hostname in groups.get(rbdmirror_group_name, [])

- name: dashboard configuration
when: dashboard_enabled | bool
block:
Expand Down
23 changes: 23 additions & 0 deletions infrastructure-playbooks/purge-cluster.yml
Original file line number Diff line number Diff line change
Expand Up @@ -610,6 +610,29 @@
- /var/lib/ceph/bootstrap-mgr
- /var/lib/ceph/tmp

# Play: on every mon/osd/mds/rgw/rbd-mirror/mgr host, stop and disable the
# ceph-crash.service unit and delete the /var/lib/ceph/crash directory.
- name: purge ceph-crash daemons
  become: true
  gather_facts: false
  hosts:
    - "{{ mon_group_name | default('mons') }}"
    - "{{ osd_group_name | default('osds') }}"
    - "{{ mds_group_name | default('mdss') }}"
    - "{{ rgw_group_name | default('rgws') }}"
    - "{{ rbdmirror_group_name | default('rbdmirrors') }}"
    - "{{ mgr_group_name | default('mgrs') }}"
  tasks:
    # Best effort: failed_when is forced to false so a failure to stop the
    # unit never aborts the purge.
    - name: stop ceph-crash service
      failed_when: false
      service:
        name: ceph-crash.service
        enabled: false
        state: stopped

    - name: remove /var/lib/ceph/crash
      file:
        state: absent
        path: /var/lib/ceph/crash


- name: final cleanup - check any running ceph, purge ceph packages, purge config and remove data

Expand Down
29 changes: 29 additions & 0 deletions infrastructure-playbooks/purge-container-cluster.yml
Original file line number Diff line number Diff line change
Expand Up @@ -468,6 +468,35 @@
failed_when: false
when: dashboard_enabled | bool

# Play: on every mon/osd/mds/rgw/rbd-mirror/mgr host, stop and disable the
# containerized ceph-crash unit, remove its systemd unit file and delete the
# /var/lib/ceph/crash directory.
- name: purge ceph-crash containers
  hosts:
    - "{{ mon_group_name | default('mons') }}"
    - "{{ osd_group_name | default('osds') }}"
    - "{{ mds_group_name | default('mdss') }}"
    - "{{ rgw_group_name | default('rgws') }}"
    - "{{ rbdmirror_group_name | default('rbdmirrors') }}"
    - "{{ mgr_group_name | default('mgrs') }}"
  gather_facts: false
  become: true
  tasks:
    # NOTE(review): this play does not gather facts, so ansible_hostname is
    # presumably already known from an earlier play in the same run — confirm.
    - name: stop ceph-crash container
      service:
        name: "ceph-crash@{{ ansible_hostname }}"
        state: stopped
        enabled: false
      failed_when: false

    # The ceph-crash role installs the unit as a systemd template file,
    # /etc/systemd/system/ceph-crash@.service (see the role's systemd.yml),
    # so that is the file that has to be deleted here. Best effort.
    - name: remove service file
      file:
        path: "/etc/systemd/system/ceph-crash@.service"
        state: absent
      failed_when: false

    - name: remove /var/lib/ceph/crash
      file:
        path: /var/lib/ceph/crash
        state: absent

- name: check container hosts

hosts:
Expand Down
21 changes: 21 additions & 0 deletions infrastructure-playbooks/rolling_update.yml
Original file line number Diff line number Diff line change
Expand Up @@ -912,6 +912,27 @@
- import_role:
name: ceph-client

# Play: apply the ceph-crash role on every mon/osd/mds/rgw/rbd-mirror/mgr
# host. ceph-defaults, the container_binary.yml entry point of ceph-facts and
# ceph-handler are imported first, in that order, before ceph-crash itself.
- name: upgrade ceph-crash daemons
  become: true
  gather_facts: false
  hosts:
    - "{{ mon_group_name | default('mons') }}"
    - "{{ osd_group_name | default('osds') }}"
    - "{{ mds_group_name | default('mdss') }}"
    - "{{ rgw_group_name | default('rgws') }}"
    - "{{ rbdmirror_group_name | default('rbdmirrors') }}"
    - "{{ mgr_group_name | default('mgrs') }}"
  tasks:
    - import_role:
        name: ceph-defaults

    - import_role:
        name: ceph-facts
        tasks_from: container_binary.yml

    - import_role:
        name: ceph-handler

    - import_role:
        name: ceph-crash

- name: complete upgrade
hosts:
- "{{ mon_group_name | default('mons') }}"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -546,3 +546,37 @@

- import_role:
name: ceph-nfs

# Play: one host at a time (serial: 1), stop and disable the packaged
# ceph-crash service, then apply the ceph-crash role with
# containerized_deployment forced to true.
- name: switching from non-containerized to containerized ceph-crash
  become: true
  serial: 1
  vars:
    containerized_deployment: true
  hosts:
    - "{{ mon_group_name | default('mons') }}"
    - "{{ osd_group_name | default('osds') }}"
    - "{{ mds_group_name | default('mdss') }}"
    - "{{ rgw_group_name | default('rgws') }}"
    - "{{ rbdmirror_group_name | default('rbdmirrors') }}"
    - "{{ mgr_group_name | default('mgrs') }}"
  tasks:
    # No failed_when override here: a failure to stop the service aborts the
    # play for that host.
    - name: stop non-containerized ceph-crash
      service:
        name: ceph-crash
        enabled: false
        state: stopped

    - import_role:
        name: ceph-defaults

    - import_role:
        name: ceph-facts
        tasks_from: container_binary.yml

    - import_role:
        name: ceph-handler

    - import_role:
        name: ceph-crash
29 changes: 29 additions & 0 deletions roles/ceph-container-common/tasks/fetch_image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,14 @@
- ceph_nfs_container_stat.get('rc') == 0
- ceph_nfs_container_stat.get('stdout_lines', [])|length != 0

# Inspect the container whose id is held in ceph_crash_container_stat.stdout.
# Only runs when that registered result has rc == 0 and at least one line of
# output; the result is kept in ceph_crash_inspect.
- name: inspect ceph crash container
  when:
    - ceph_crash_container_stat.get('rc') == 0
    - ceph_crash_container_stat.get('stdout_lines', [])|length != 0
  register: ceph_crash_inspect
  changed_when: false
  command: "{{ container_binary }} inspect {{ ceph_crash_container_stat.stdout }}"

# NOTE(leseb): using failed_when to handle the case when the image is not present yet
- name: "inspecting ceph mon container image before pulling"
command: "{{ container_binary }} inspect {{ (ceph_mon_inspect.stdout | from_json)[0].Image }}"
Expand Down Expand Up @@ -127,6 +135,13 @@
- nfs_group_name in group_names
- ceph_nfs_inspect.get('rc') == 0

# Inspect the image referenced by the first entry of the ceph_crash_inspect
# JSON output. failed_when is forced to false, so a failing inspect does not
# stop the run; the outcome is kept in
# ceph_crash_container_inspect_before_pull.
- name: "inspecting ceph crash container image before pulling"
  when:
    - ceph_crash_inspect.get('rc') == 0
  register: ceph_crash_container_inspect_before_pull
  failed_when: false
  changed_when: false
  command: "{{ container_binary }} inspect {{ (ceph_crash_inspect.stdout | from_json)[0].Image }}"

- name: set_fact ceph_mon_image_repodigest_before_pulling
set_fact:
ceph_mon_image_repodigest_before_pulling: "{{ (ceph_mon_container_inspect_before_pull.stdout | from_json)[0].Id }}"
Expand Down Expand Up @@ -162,6 +177,11 @@
- mgr_group_name in group_names
- ceph_mgr_container_inspect_before_pull.get('rc') == 0

# Record the Id field of the first entry of the pre-pull image inspect
# output, provided that inspect returned rc == 0.
- name: set_fact ceph_crash_image_repodigest_before_pulling
  when:
    - ceph_crash_container_inspect_before_pull.get('rc') == 0
  set_fact:
    ceph_crash_image_repodigest_before_pulling: "{{ (ceph_crash_container_inspect_before_pull.stdout | from_json)[0].Id }}"

- name: set_fact ceph_rbd_mirror_image_repodigest_before_pulling
set_fact:
ceph_rbd_mirror_image_repodigest_before_pulling: "{{ (ceph_rbd_mirror_container_inspect_before_pull.stdout | from_json)[0].Id }}"
Expand Down Expand Up @@ -266,6 +286,15 @@
- ceph_nfs_container_inspect_before_pull.get('rc') == 0
- ceph_nfs_image_repodigest_before_pulling != image_repodigest_after_pulling

# When the pre-pull inspect succeeded and the recorded id differs from
# image_repodigest_after_pulling, store that comparison in
# ceph_crash_image_updated. changed_when is forced to true so the task
# reports "changed" and the "restart ceph crash" notification fires.
- name: set_fact ceph_crash_image_updated
  when:
    - ceph_crash_container_inspect_before_pull.get('rc') == 0
    - ceph_crash_image_repodigest_before_pulling != image_repodigest_after_pulling
  notify: restart ceph crash
  changed_when: true
  set_fact:
    ceph_crash_image_updated: "{{ ceph_crash_image_repodigest_before_pulling != image_repodigest_after_pulling }}"

- name: export local ceph dev image
command: >
{{ container_binary }} save -o "/tmp/{{ ceph_docker_username }}-{{ ceph_docker_imagename }}-{{ ceph_docker_image_tag }}.tar"
Expand Down
15 changes: 15 additions & 0 deletions roles/ceph-crash/meta/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
---
# Ansible Galaxy metadata for the ceph-crash role.
galaxy_info:
  company: Red Hat
  author: Guillaume Abrioux
  description: Deploy ceph-crash
  license: Apache
  # Quoted so the value stays a version string instead of being parsed as a
  # YAML float.
  min_ansible_version: "2.7"
  platforms:
    - name: EL
      versions:
        - 7
        - 8
  galaxy_tags:
    - system
# The role declares no role dependencies.
dependencies: []
71 changes: 71 additions & 0 deletions roles/ceph-crash/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
---
# Create the client.crash cephx key and copy its keyring to the target
# nodes. The whole block is skipped when cephx is disabled.
- name: create and copy client.crash keyring
  when: cephx | bool
  block:
    # Runs once, delegated to the first host of the monitor group. The key is
    # given the "crash" profile on both mon and mgr.
    - name: create client.crash keyring
      ceph_key:
        state: present
        name: "client.crash"
        caps: "{{ {'mon': 'allow profile crash', 'mgr': 'allow profile crash'} }}"
        cluster: "{{ cluster }}"
        dest: "{{ ceph_conf_key_directory }}"
        import_key: true
        mode: "{{ ceph_keyring_permissions }}"
        owner: "{{ ceph_uid if containerized_deployment else 'ceph' }}"
        group: "{{ ceph_uid if containerized_deployment else 'ceph' }}"
      environment:
        CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment else None }}"
        CEPH_CONTAINER_BINARY: "{{ container_binary }}"
      delegate_to: "{{ groups.get(mon_group_name, [])[0] }}"
      run_once: true

    # Read-only query of the key on the first monitor (prefixed with that
    # host's container_exec_cmd when it is defined). It never modifies
    # anything, so it is explicitly reported as unchanged.
    - name: get keys from monitors
      command: "{{ hostvars[groups[mon_group_name][0]]['container_exec_cmd'] | default('') }} ceph --cluster {{ cluster }} auth get client.crash"
      register: _crash_keys
      changed_when: false
      delegate_to: "{{ groups.get(mon_group_name)[0] }}"
      run_once: true

    # Accumulates list_target_node over the loop: on each iteration the
    # current list is unioned with (all hosts minus the grafana, client, nfs
    # and iscsi-gw group members) plus the hosts of the looped group,
    # de-duplicated. Each loop entry is the group name when that group has
    # hosts and an empty list otherwise.
    - name: get a list of node where the keyring should be copied
      set_fact:
        list_target_node: "{{ list_target_node | default([]) | union(((groups.get('all') | difference(groups.get(grafana_server_group_name, []) + groups.get(client_group_name, []) + groups.get(nfs_group_name, []) + groups.get(iscsi_gw_group_name, []))) + groups.get(item, [])) | unique) }}"
      run_once: true
      with_items:
        - "{{ mon_group_name if groups.get(mon_group_name, []) | length > 0 else [] }}"
        - "{{ osd_group_name if groups.get(osd_group_name, []) | length > 0 else [] }}"
        - "{{ mds_group_name if groups.get(mds_group_name, []) | length > 0 else [] }}"
        - "{{ rgw_group_name if groups.get(rgw_group_name, []) | length > 0 else [] }}"
        - "{{ rbdmirror_group_name if groups.get(rbdmirror_group_name, []) | length > 0 else [] }}"
        - "{{ mgr_group_name if groups.get(mgr_group_name, []) | length > 0 else [] }}"

    # Runs once and delegates to every host in list_target_node, writing the
    # fetched key (plus a trailing newline) as the client.crash keyring.
    - name: copy ceph key(s) if needed
      copy:
        dest: "{{ ceph_conf_key_directory }}/{{ cluster }}.client.crash.keyring"
        content: "{{ _crash_keys.stdout + '\n' }}"
        owner: "{{ ceph_uid if containerized_deployment else 'ceph' }}"
        group: "{{ ceph_uid if containerized_deployment else 'ceph' }}"
        mode: "{{ ceph_keyring_permissions }}"
      with_items: "{{ list_target_node }}"
      delegate_to: "{{ item }}"
      run_once: true

# Containerized deployments only: create the crash "posted" directory owned
# by ceph_uid, then include systemd.yml from this role.
- name: start ceph-crash daemon
  when: containerized_deployment | bool
  block:
    - name: create /var/lib/ceph/crash/posted
      file:
        state: directory
        path: /var/lib/ceph/crash/posted
        owner: "{{ ceph_uid }}"
        group: "{{ ceph_uid }}"
        mode: '0755'

    - name: include_tasks systemd.yml
      include_tasks: systemd.yml

# Start, enable and unmask the service after a systemd daemon reload. The
# unit name is the per-host instance "ceph-crash@<ansible_hostname>" for
# containerized deployments and "ceph-crash.service" otherwise.
- name: start the ceph-crash service
  systemd:
    daemon_reload: true
    masked: false
    enabled: true
    state: started
    name: "{{ 'ceph-crash@' + ansible_hostname if containerized_deployment | bool else 'ceph-crash.service' }}"
9 changes: 9 additions & 0 deletions roles/ceph-crash/tasks/systemd.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
# Render the role's ceph-crash.service.j2 template into the systemd template
# unit /etc/systemd/system/ceph-crash@.service (root:root, 0644) and notify
# "restart ceph crash" when the rendered file changes.
- name: generate systemd unit file for ceph-crash container
  notify: restart ceph crash
  template:
    dest: /etc/systemd/system/ceph-crash@.service
    src: "{{ role_path }}/templates/ceph-crash.service.j2"
    mode: "0644"
    owner: "root"
    group: "root"
41 changes: 41 additions & 0 deletions roles/ceph-crash/templates/ceph-crash.service.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
{# systemd template unit that runs /usr/bin/ceph-crash from the configured ceph container image. #}
{# %i is the unit instance name; the container is named ceph-crash-%i. #}
{# With docker the unit is ordered after, and requires, docker.service; otherwise it is ordered after network.target. #}
[Unit]
Description=Ceph crash dump collector
{% if container_binary == 'docker' %}
After=docker.service
Requires=docker.service
{% else %}
After=network.target
{% endif %}

[Service]
{% if container_binary == 'podman' %}
ExecStartPre=-/usr/bin/rm -f /%t/%n-pid /%t/%n-cid
ExecStartPre=-/usr/bin/{{ container_binary }} rm -f ceph-crash-%i
{% endif %}
{# /etc/ceph is bind-mounted so the container can see the cluster configuration and the client.crash keyring the role distributes. #}
{# NOTE(review): assumes ceph_conf_key_directory resolves to /etc/ceph; confirm against the role defaults. #}
ExecStart=/usr/bin/{{ container_binary }} run --rm --name ceph-crash-%i \
{% if container_binary == 'podman' %}
-d --conmon-pidfile /%t/%n-pid --cidfile /%t/%n-cid \
{% endif %}
--net=host \
-v /var/lib/ceph:/var/lib/ceph:z \
-v /etc/localtime:/etc/localtime:ro \
-v /etc/ceph:/etc/ceph:z \
--entrypoint=/usr/bin/ceph-crash {{ ceph_docker_registry }}/{{ ceph_docker_image }}:{{ ceph_docker_image_tag }}
{% if container_binary == 'podman' %}
ExecStop=-/usr/bin/sh -c "/usr/bin/{{ container_binary }} rm -f `cat /%t/%n-cid`"
{% else %}
ExecStop=-/usr/bin/{{ container_binary }} stop ceph-crash-%i
{% endif %}
StartLimitInterval=10min
StartLimitBurst=30
{% if container_binary == 'podman' %}
Type=forking
PIDFile=/%t/%n-pid
{% endif %}
KillMode=none
Restart=always
RestartSec=10s
TimeoutStartSec=120
TimeoutStopSec=10

[Install]
WantedBy=multi-user.target
4 changes: 4 additions & 0 deletions roles/ceph-handler/handlers/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,7 @@
include_tasks: handler_rbd_target_api_gw.yml
when: iscsi_gw_group_name in group_names
listen: "restart ceph rbd-target-api-gw"

# Handler listening on "restart ceph crash"; it has no host-group condition
# and simply includes handler_crash.yml.
- name: ceph crash handler
  listen: "restart ceph crash"
  include_tasks: handler_crash.yml
7 changes: 7 additions & 0 deletions roles/ceph-handler/tasks/check_running_containers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,10 @@
failed_when: false
check_mode: no
when: inventory_hostname in groups.get(iscsi_gw_group_name, [])

# List (ids only, -q) the containers whose name matches
# ceph-crash-<ansible_hostname> and register the result as
# ceph_crash_container_stat. The task never fails, never reports a change,
# and also runs in check mode.
- name: check for a ceph-crash container
  check_mode: false
  failed_when: false
  changed_when: false
  register: ceph_crash_container_stat
  command: "{{ container_binary }} ps -q --filter='name=ceph-crash-{{ ansible_hostname }}'"
7 changes: 7 additions & 0 deletions roles/ceph-handler/tasks/check_socket_non_container.yml
Original file line number Diff line number Diff line change
Expand Up @@ -216,3 +216,10 @@
failed_when: false
check_mode: no
when: inventory_hostname in groups.get(iscsi_gw_group_name, [])

# Look for a ceph-crash process with pgrep and register the result as
# crash_process. The task never fails, never reports a change, and also runs
# in check mode.
- name: check for a ceph-crash process
  register: crash_process
  check_mode: false
  failed_when: false
  changed_when: false
  command: pgrep ceph-crash
18 changes: 18 additions & 0 deletions roles/ceph-handler/tasks/handler_crash.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
---
# Flag that the crash handler is running before the restart is attempted.
- name: set _crash_handler_called before restart
  set_fact:
    _crash_handler_called: True

# Restart (and enable/unmask, after a daemon reload) the per-host
# ceph-crash@<ansible_hostname> unit while the flag set above is true.
# Errors are ignored, so a failed restart does not stop the run.
- name: restart the ceph-crash service
  when: hostvars[inventory_hostname]['_crash_handler_called'] | default(False) | bool
  ignore_errors: true
  systemd:
    name: ceph-crash@{{ ansible_hostname }}
    daemon_reload: true
    masked: false
    enabled: true
    state: restarted

# Clear the flag once the restart has been attempted.
- name: set _crash_handler_called after restart
  set_fact:
    _crash_handler_called: False

0 comments on commit e6059fd

Please sign in to comment.