infra: add playbook to purge dashboard/monitoring
The dashboard/monitoring stack can be deployed via the dashboard_enabled
variable, but there's no similar way to remove only that part while
keeping the Ceph cluster up and running.
The current purge playbooks remove everything.

Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1786691

Signed-off-by: Dimitri Savineau <dsavinea@redhat.com>
(cherry picked from commit 8e4ef7d)
dsavineau committed Jul 6, 2021
1 parent d5784c0 commit b3dde31
Showing 2 changed files with 228 additions and 1 deletion.
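Per the usage notes in the new playbook's header, a non-interactive run looks like this (the inventory path `hosts` is a placeholder, not part of this commit):

  ansible-playbook -i hosts infrastructure-playbooks/purge-dashboard.yml -e ireallymeanit=yes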
206 changes: 206 additions & 0 deletions infrastructure-playbooks/purge-dashboard.yml
@@ -0,0 +1,206 @@
---
# This playbook purges the Ceph MGR Dashboard and Monitoring
# (alertmanager/prometheus/grafana/node-exporter) stack.
# It removes: packages, configuration files and ALL THE DATA.
#
# Use it like this:
# ansible-playbook purge-dashboard.yml
#     Prompts for confirmation to purge; defaults to 'no' and
#     purges nothing. Answering 'yes' purges the dashboard and
#     monitoring stack.
#
# ansible-playbook -e ireallymeanit=yes|no purge-dashboard.yml
#     Overrides the prompt using the -e option. Can be used in
#     automation scripts to avoid the interactive prompt.

- name: confirm whether user really meant to purge the dashboard
  hosts: localhost
  gather_facts: false
  vars_prompt:
    - name: ireallymeanit
      prompt: Are you sure you want to purge the dashboard?
      default: 'no'
      private: no
  tasks:
    - name: exit playbook, if user did not mean to purge dashboard
      fail:
        msg: >
          "Exiting purge-dashboard playbook, dashboard was NOT purged.
           To purge the dashboard, either say 'yes' on the prompt or
           use `-e ireallymeanit=yes` on the command line when
           invoking the playbook"
      when: ireallymeanit != 'yes'

- name: gather facts on all hosts
  hosts:
    - "{{ mon_group_name|default('mons') }}"
    - "{{ osd_group_name|default('osds') }}"
    - "{{ mds_group_name|default('mdss') }}"
    - "{{ rgw_group_name|default('rgws') }}"
    - "{{ rbdmirror_group_name|default('rbdmirrors') }}"
    - "{{ nfs_group_name|default('nfss') }}"
    - "{{ client_group_name|default('clients') }}"
    - "{{ mgr_group_name|default('mgrs') }}"
    - "{{ grafana_server_group_name | default('grafana-server') }}"
  become: true
  tasks:
    - debug: msg="gather facts on all Ceph hosts for later reference"

- name: purge node exporter
  hosts:
    - "{{ mon_group_name|default('mons') }}"
    - "{{ osd_group_name|default('osds') }}"
    - "{{ mds_group_name|default('mdss') }}"
    - "{{ rgw_group_name|default('rgws') }}"
    - "{{ rbdmirror_group_name|default('rbdmirrors') }}"
    - "{{ nfs_group_name|default('nfss') }}"
    - "{{ client_group_name|default('clients') }}"
    - "{{ mgr_group_name|default('mgrs') }}"
    - "{{ grafana_server_group_name | default('grafana-server') }}"
  gather_facts: false
  become: true
  tasks:
    - import_role:
        name: ceph-defaults

    - import_role:
        name: ceph-facts
        tasks_from: container_binary

    - name: disable node_exporter service
      service:
        name: node_exporter
        state: stopped
        enabled: no
      failed_when: false

    - name: remove node_exporter service file
      file:
        name: /etc/systemd/system/node_exporter.service
        state: absent

    - name: remove node-exporter image
      command: "{{ container_binary }} rmi {{ node_exporter_container_image }}"
      changed_when: false
      failed_when: false

- name: purge ceph monitoring
  hosts: "{{ grafana_server_group_name | default('grafana-server') }}"
  gather_facts: false
  become: true
  tasks:
    - import_role:
        name: ceph-defaults

    - import_role:
        name: ceph-facts
        tasks_from: container_binary

    - name: stop services
      service:
        name: "{{ item }}"
        state: stopped
        enabled: no
      failed_when: false
      loop:
        - alertmanager
        - prometheus
        - grafana-server

    - name: remove systemd service files
      file:
        name: "/etc/systemd/system/{{ item }}.service"
        state: absent
      loop:
        - alertmanager
        - prometheus
        - grafana-server

    - name: remove ceph dashboard container images
      command: "{{ container_binary }} rmi {{ item }}"
      loop:
        - "{{ alertmanager_container_image }}"
        - "{{ prometheus_container_image }}"
        - "{{ grafana_container_image }}"
      changed_when: false
      failed_when: false

    - name: remove ceph-grafana-dashboards package on RedHat or SUSE
      package:
        name: ceph-grafana-dashboards
        state: absent
      when:
        - not containerized_deployment | bool
        - ansible_facts['os_family'] in ['RedHat', 'Suse']

    - name: remove data
      file:
        name: "{{ item }}"
        state: absent
      loop:
        - "{{ alertmanager_conf_dir }}"
        - "{{ prometheus_conf_dir }}"
        - /etc/grafana
        - "{{ alertmanager_data_dir }}"
        - "{{ prometheus_data_dir }}"
        - /var/lib/grafana

- name: purge ceph dashboard
  hosts: "{{ groups[mgr_group_name] | default(groups[mon_group_name]) | default(omit) }}"
  gather_facts: false
  become: true
  environment:
    CEPH_CONTAINER_IMAGE: "{{ ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else None }}"
    CEPH_CONTAINER_BINARY: "{{ container_binary }}"
  tasks:
    - import_role:
        name: ceph-defaults

    - import_role:
        name: ceph-facts
        tasks_from: container_binary

    - name: set_fact ceph_cmd
      set_fact:
        ceph_cmd: "{{ hostvars[groups[mon_group_name][0]]['container_binary'] + ' run --net=host --rm -v /etc/ceph:/etc/ceph:z --entrypoint=ceph ' + ceph_docker_registry + '/' + ceph_docker_image + ':' + ceph_docker_image_tag if containerized_deployment | bool else 'ceph' }}"

    - name: remove the dashboard admin user
      ceph_dashboard_user:
        name: "{{ dashboard_admin_user }}"
        cluster: "{{ cluster }}"
        state: absent
      run_once: true
      delegate_to: "{{ groups[mon_group_name][0] }}"

    - name: remove radosgw system user
      radosgw_user:
        name: "{{ dashboard_rgw_api_user_id }}"
        cluster: "{{ cluster }}"
        state: absent
      run_once: true
      delegate_to: "{{ groups[mon_group_name][0] }}"
      when: groups.get(rgw_group_name, []) | length > 0

    - name: disable mgr dashboard and prometheus modules
      command: "{{ ceph_cmd }} --cluster {{ cluster }} mgr module disable {{ item }}"
      delegate_to: "{{ groups[mon_group_name][0] }}"
      run_once: true
      changed_when: false
      loop:
        - dashboard
        - prometheus

    - name: remove TLS certificate and key files
      file:
        name: "/etc/ceph/ceph-dashboard.{{ item }}"
        state: absent
      loop:
        - crt
        - key
      when: dashboard_protocol == "https"

    - name: remove ceph-mgr-dashboard package
      package:
        name: ceph-mgr-dashboard
        state: absent
      when: not containerized_deployment | bool
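Once the playbook has run, a quick sanity check (a suggested follow-up, not part of this commit) is to list the active manager services from a monitor node:

  ceph --cluster ceph mgr services

With the dashboard and prometheus modules disabled, the output should no longer report their endpoints.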
23 changes: 22 additions & 1 deletion tox.ini
@@ -1,5 +1,5 @@
[tox]
-envlist = centos-{container,non_container}-{all_daemons,collocation,lvm_osds,shrink_mon,shrink_osd,shrink_mgr,shrink_mds,shrink_rbdmirror,shrink_rgw,lvm_batch,add_mons,add_mgrs,add_mdss,add_rbdmirrors,add_rgws,rgw_multisite,purge,storage_inventory,lvm_auto_discovery,all_in_one,cephadm_adopt}
+envlist = centos-{container,non_container}-{all_daemons,collocation,lvm_osds,shrink_mon,shrink_osd,shrink_mgr,shrink_mds,shrink_rbdmirror,shrink_rgw,lvm_batch,add_mons,add_mgrs,add_mdss,add_rbdmirrors,add_rgws,rgw_multisite,purge,storage_inventory,lvm_auto_discovery,all_in_one,cephadm_adopt,purge_dashboard}
  centos-non_container-{switch_to_containers}
  infra_lv_create
  migrate_ceph_disk_to_ceph_volume
@@ -133,6 +133,25 @@ commands=
  # test that the cluster can be redeployed in a healthy state
  py.test --reruns 5 --reruns-delay 1 -n 8 --durations=0 --sudo -v --connection=ansible --ansible-inventory={changedir}/{env:INVENTORY} --ssh-config={changedir}/vagrant_ssh_config {toxinidir}/tests/functional/tests

[purge-dashboard]
commands=
  ansible-playbook -vv -i {changedir}/{env:INVENTORY} {toxinidir}/infrastructure-playbooks/purge-dashboard.yml --extra-vars "\
      ireallymeanit=yes \
      ceph_docker_registry={env:CEPH_DOCKER_REGISTRY:quay.ceph.io} \
      ceph_docker_image={env:CEPH_DOCKER_IMAGE:ceph-ci/daemon} \
      ceph_docker_image_tag={env:CEPH_DOCKER_IMAGE_TAG:latest-octopus} \
  "

  # set up the cluster again
  ansible-playbook -vv -i {changedir}/{env:INVENTORY} {toxinidir}/{env:PLAYBOOK:site.yml.sample} --extra-vars @ceph-override.json --extra-vars "\
      ceph_stable_release={env:CEPH_STABLE_RELEASE:octopus} \
      ceph_docker_registry_auth=True \
      ceph_docker_registry_username={env:DOCKER_HUB_USERNAME} \
      ceph_docker_registry_password={env:DOCKER_HUB_PASSWORD} \
  "
  # test that the cluster can be redeployed in a healthy state
  py.test --reruns 5 --reruns-delay 1 -n 8 --durations=0 --sudo -v --connection=ansible --ansible-inventory={changedir}/{env:INVENTORY} --ssh-config={changedir}/vagrant_ssh_config {toxinidir}/tests/functional/tests

[purge-lvm]
commands=
  ansible-playbook -vv -i {changedir}/{env:INVENTORY} {toxinidir}/infrastructure-playbooks/{env:PURGE_PLAYBOOK:purge-cluster.yml} --extra-vars "\
@@ -365,6 +384,7 @@ changedir=
  # tests a 1 mon, 1 osd, 1 mds and 1 rgw centos7 cluster using docker
  collocation: {toxinidir}/tests/functional/collocation{env:CONTAINER_DIR:}
  purge: {toxinidir}/tests/functional/all_daemons{env:CONTAINER_DIR:}
  purge_dashboard: {toxinidir}/tests/functional/all_daemons{env:CONTAINER_DIR:}
  switch_to_containers: {toxinidir}/tests/functional/all_daemons
  lvm_osds: {toxinidir}/tests/functional/lvm-osds{env:CONTAINER_DIR:}
  lvm_batch: {toxinidir}/tests/functional/lvm-batch{env:CONTAINER_DIR:}
@@ -416,6 +436,7 @@ commands=
  all_daemons,all_in_one,collocation: ansible-playbook -vv -i {changedir}/{env:INVENTORY} {toxinidir}/{env:PLAYBOOK:site.yml.sample} --extra-vars "delegate_facts_host={env:DELEGATE_FACTS_HOST:True} ceph_stable_release={env:CEPH_STABLE_RELEASE:octopus} ceph_docker_image_tag={env:CEPH_DOCKER_IMAGE_TAG_BIS:latest-bis-octopus}" --extra-vars @ceph-override.json

  purge: {[purge]commands}
  purge_dashboard: {[purge-dashboard]commands}
  switch_to_containers: {[switch-to-containers]commands}
  shrink_mon: {[shrink-mon]commands}
  shrink_osd: {[shrink-osd]commands}
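With these tox changes in place, the new scenario can be exercised like the existing purge scenario; for example, using the container flavor (environment name derived from the envlist above):

  tox -e centos-container-purge_dashboard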
