infra: add playbook to purge dashboard/monitoring
The dashboard/monitoring stack can be deployed via the dashboard_enabled
variable, but there's no similar way to remove only that part while
keeping the Ceph cluster up and running.
The current purge playbooks remove everything.

Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1786691

Signed-off-by: Dimitri Savineau <dsavinea@redhat.com>
(cherry picked from commit 8e4ef7d)
dsavineau committed Jul 6, 2021
1 parent d5784c0 commit b3dde31
Showing 2 changed files with 228 additions and 1 deletion.
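Per the usage notes in the new playbook's header, a non-interactive run looks like this (the inventory path `hosts` is a placeholder, not part of this commit):

  ansible-playbook -i hosts infrastructure-playbooks/purge-dashboard.yml -e ireallymeanit=yes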
206 changes: 206 additions & 0 deletions infrastructure-playbooks/purge-dashboard.yml
@@ -0,0 +1,206 @@
---
# This playbook purges the Ceph MGR Dashboard and Monitoring
# (alertmanager/prometheus/grafana/node-exporter) stack.
# It removes: packages, configuration files and ALL THE DATA.
#
# Use it like this:
# ansible-playbook purge-dashboard.yml
#     Prompts for confirmation to purge; defaults to 'no' and
#     purges nothing. Answering 'yes' purges the dashboard and
#     monitoring stack.
#
# ansible-playbook -e ireallymeanit=yes|no purge-dashboard.yml
#     Overrides the prompt using the -e option. Can be used in
#     automation scripts to avoid the interactive prompt.

- name: confirm whether user really meant to purge the dashboard
  hosts: localhost
  gather_facts: false
  vars_prompt:
    - name: ireallymeanit
      prompt: Are you sure you want to purge the dashboard?
      default: 'no'
      private: no
  tasks:
    - name: exit playbook, if user did not mean to purge dashboard
      fail:
        msg: >
          "Exiting purge-dashboard playbook, dashboard was NOT purged.
           To purge the dashboard, either say 'yes' on the prompt or
           use `-e ireallymeanit=yes` on the command line when
           invoking the playbook"
      when: ireallymeanit != 'yes'

- name: gather facts on all hosts
  hosts:
    - "{{ mon_group_name|default('mons') }}"
    - "{{ osd_group_name|default('osds') }}"
    - "{{ mds_group_name|default('mdss') }}"
    - "{{ rgw_group_name|default('rgws') }}"
    - "{{ rbdmirror_group_name|default('rbdmirrors') }}"
    - "{{ nfs_group_name|default('nfss') }}"
    - "{{ client_group_name|default('clients') }}"
    - "{{ mgr_group_name|default('mgrs') }}"
    - "{{ grafana_server_group_name | default('grafana-server') }}"
  become: true
  tasks:
    - debug: msg="gather facts on all Ceph hosts for later reference"

- name: purge node exporter
  hosts:
    - "{{ mon_group_name|default('mons') }}"
    - "{{ osd_group_name|default('osds') }}"
    - "{{ mds_group_name|default('mdss') }}"
    - "{{ rgw_group_name|default('rgws') }}"
    - "{{ rbdmirror_group_name|default('rbdmirrors') }}"
    - "{{ nfs_group_name|default('nfss') }}"
    - "{{ client_group_name|default('clients') }}"
    - "{{ mgr_group_name|default('mgrs') }}"
    - "{{ grafana_server_group_name | default('grafana-server') }}"
  gather_facts: false
  become: true
  tasks:
    - import_role:
        name: ceph-defaults

    - import_role:
        name: ceph-facts
        tasks_from: container_binary

    - name: disable node_exporter service
      service:
        name: node_exporter
        state: stopped
        enabled: no
      failed_when: false

    - name: remove node_exporter service file
      file:
        name: /etc/systemd/system/node_exporter.service
        state: absent

    - name: remove node-exporter image
      command: "{{ container_binary }} rmi {{ node_exporter_container_image }}"
      changed_when: false
      failed_when: false

- name: purge ceph monitoring
  hosts: "{{ grafana_server_group_name | default('grafana-server') }}"
  gather_facts: false
  become: true
  tasks:
    - import_role:
        name: ceph-defaults

    - import_role:
        name: ceph-facts
        tasks_from: container_binary

    - name: stop services
      service:
        name: "{{ item }}"
        state: stopped
        enabled: no
      failed_when: false
      loop:
        - alertmanager
        - prometheus
        - grafana-server

    - name: remove systemd service files
      file:
        name: "/etc/systemd/system/{{ item }}.service"
        state: absent
      loop:
        - alertmanager
        - prometheus
        - grafana-server

    - name: remove ceph dashboard container images
      command: "{{ container_binary }} rmi {{ item }}"
      loop:
        - "{{ alertmanager_container_image }}"
        - "{{ prometheus_container_image }}"
        - "{{ grafana_container_image }}"
      changed_when: false
      failed_when: false

    - name: remove ceph-grafana-dashboards package on RedHat or SUSE
      package:
        name: ceph-grafana-dashboards
        state: absent
      when:
        - not containerized_deployment | bool
        - ansible_facts['os_family'] in ['RedHat', 'Suse']

    - name: remove data
      file:
        name: "{{ item }}"
        state: absent
      loop:
        - "{{ alertmanager_conf_dir }}"
        - "{{ prometheus_conf_dir }}"
        - /etc/grafana
        - "{{ alertmanager_data_dir }}"
        - "{{ prometheus_data_dir }}"
        - /var/lib/grafana

- name: purge ceph dashboard
  hosts: "{{ groups[mgr_group_name] | default(groups[mon_group_name]) | default(omit) }}"
  gather_facts: false
  become: true
  environment:
    CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else None }}"
    CEPH_CONTAINER_BINARY: "{{ container_binary }}"
  tasks:
    - import_role:
        name: ceph-defaults

    - import_role:
        name: ceph-facts
        tasks_from: container_binary

    - name: set_fact ceph_cmd
      set_fact:
        ceph_cmd: "{{ hostvars[groups[mon_group_name][0]]['container_binary'] + ' run --net=host --rm -v /etc/ceph:/etc/ceph:z --entrypoint=ceph ' + ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else 'ceph' }}"

    - name: remove the dashboard admin user
      ceph_dashboard_user:
        name: "{{ dashboard_admin_user }}"
        cluster: "{{ cluster }}"
        state: absent
      run_once: true
      delegate_to: "{{ groups[mon_group_name][0] }}"

    - name: remove radosgw system user
      radosgw_user:
        name: "{{ dashboard_rgw_api_user_id }}"
        cluster: "{{ cluster }}"
        state: absent
      run_once: true
      delegate_to: "{{ groups[mon_group_name][0] }}"
      when: groups.get(rgw_group_name, []) | length > 0

    - name: disable mgr dashboard and prometheus modules
      command: "{{ ceph_cmd }} --cluster {{ cluster }} mgr module disable {{ item }}"
      delegate_to: "{{ groups[mon_group_name][0] }}"
      run_once: true
      changed_when: false
      loop:
        - dashboard
        - prometheus

    - name: remove TLS certificate and key files
      file:
        name: "/etc/ceph/ceph-dashboard.{{ item }}"
        state: absent
      loop:
        - crt
        - key
      when: dashboard_protocol == "https"

    - name: remove ceph-mgr-dashboard package
      package:
        name: ceph-mgr-dashboard
        state: absent
      when: not containerized_deployment | bool
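Once the playbook has run, a quick sanity check (a suggested follow-up, not part of this commit) is to list the active manager services from a monitor node:

  ceph --cluster ceph mgr services

With the dashboard and prometheus modules disabled, the output should no longer report their endpoints.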
23 changes: 22 additions & 1 deletion tox.ini
@@ -1,5 +1,5 @@
[tox]
-envlist = centos-{container,non_container}-{all_daemons,collocation,lvm_osds,shrink_mon,shrink_osd,shrink_mgr,shrink_mds,shrink_rbdmirror,shrink_rgw,lvm_batch,add_mons,add_mgrs,add_mdss,add_rbdmirrors,add_rgws,rgw_multisite,purge,storage_inventory,lvm_auto_discovery,all_in_one,cephadm_adopt}
+envlist = centos-{container,non_container}-{all_daemons,collocation,lvm_osds,shrink_mon,shrink_osd,shrink_mgr,shrink_mds,shrink_rbdmirror,shrink_rgw,lvm_batch,add_mons,add_mgrs,add_mdss,add_rbdmirrors,add_rgws,rgw_multisite,purge,storage_inventory,lvm_auto_discovery,all_in_one,cephadm_adopt,purge_dashboard}
  centos-non_container-{switch_to_containers}
  infra_lv_create
  migrate_ceph_disk_to_ceph_volume
@@ -133,6 +133,25 @@ commands=
  # test that the cluster can be redeployed in a healthy state
  py.test --reruns 5 --reruns-delay 1 -n 8 --durations=0 --sudo -v --connection=ansible --ansible-inventory={changedir}/{env:INVENTORY} --ssh-config={changedir}/vagrant_ssh_config {toxinidir}/tests/functional/tests

[purge-dashboard]
commands=
  ansible-playbook -vv -i {changedir}/{env:INVENTORY} {toxinidir}/infrastructure-playbooks/purge-dashboard.yml --extra-vars "\
      ireallymeanit=yes \
      ceph_docker_registry={env:CEPH_DOCKER_REGISTRY:quay.ceph.io} \
      ceph_docker_image={env:CEPH_DOCKER_IMAGE:ceph-ci/daemon} \
      ceph_docker_image_tag={env:CEPH_DOCKER_IMAGE_TAG:latest-octopus} \
  "

  # set up the cluster again
  ansible-playbook -vv -i {changedir}/{env:INVENTORY} {toxinidir}/{env:PLAYBOOK:site.yml.sample} --extra-vars @ceph-override.json --extra-vars "\
      ceph_stable_release={env:CEPH_STABLE_RELEASE:octopus} \
      ceph_docker_registry_auth=True \
      ceph_docker_registry_username={env:DOCKER_HUB_USERNAME} \
      ceph_docker_registry_password={env:DOCKER_HUB_PASSWORD} \
  "
  # test that the cluster can be redeployed in a healthy state
  py.test --reruns 5 --reruns-delay 1 -n 8 --durations=0 --sudo -v --connection=ansible --ansible-inventory={changedir}/{env:INVENTORY} --ssh-config={changedir}/vagrant_ssh_config {toxinidir}/tests/functional/tests

[purge-lvm]
commands=
  ansible-playbook -vv -i {changedir}/{env:INVENTORY} {toxinidir}/infrastructure-playbooks/{env:PURGE_PLAYBOOK:purge-cluster.yml} --extra-vars "\
@@ -365,6 +384,7 @@ changedir=
  # tests a 1 mon, 1 osd, 1 mds and 1 rgw centos7 cluster using docker
  collocation: {toxinidir}/tests/functional/collocation{env:CONTAINER_DIR:}
  purge: {toxinidir}/tests/functional/all_daemons{env:CONTAINER_DIR:}
  purge_dashboard: {toxinidir}/tests/functional/all_daemons{env:CONTAINER_DIR:}
  switch_to_containers: {toxinidir}/tests/functional/all_daemons
  lvm_osds: {toxinidir}/tests/functional/lvm-osds{env:CONTAINER_DIR:}
  lvm_batch: {toxinidir}/tests/functional/lvm-batch{env:CONTAINER_DIR:}
@@ -416,6 +436,7 @@ commands=
  all_daemons,all_in_one,collocation: ansible-playbook -vv -i {changedir}/{env:INVENTORY} {toxinidir}/{env:PLAYBOOK:site.yml.sample} --extra-vars "delegate_facts_host={env:DELEGATE_FACTS_HOST:True} ceph_stable_release={env:CEPH_STABLE_RELEASE:octopus} ceph_docker_image_tag={env:CEPH_DOCKER_IMAGE_TAG_BIS:latest-bis-octopus}" --extra-vars @ceph-override.json

  purge: {[purge]commands}
  purge_dashboard: {[purge-dashboard]commands}
  switch_to_containers: {[switch-to-containers]commands}
  shrink_mon: {[shrink-mon]commands}
  shrink_osd: {[shrink-osd]commands}
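With these tox changes in place, the new scenario can be exercised like the existing purge scenario; for example, using the container flavor (environment name derived from the envlist above):

  tox -e centos-container-purge_dashboard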
