Skip to content

Commit

Permalink
Merge pull request #24639 from dmick/wip-crashdump-mimic-backport
Browse files Browse the repository at this point in the history
mimic: mgr: crashdump feature backport
  • Loading branch information
liewegas committed Apr 5, 2019
2 parents ef85320 + 1feaa08 commit 5ae3e4b
Show file tree
Hide file tree
Showing 52 changed files with 958 additions and 57 deletions.
16 changes: 11 additions & 5 deletions ceph.spec.in
Expand Up @@ -1007,6 +1007,8 @@ mkdir -p %{buildroot}%{_localstatedir}/lib/ceph/mon
mkdir -p %{buildroot}%{_localstatedir}/lib/ceph/osd
mkdir -p %{buildroot}%{_localstatedir}/lib/ceph/mds
mkdir -p %{buildroot}%{_localstatedir}/lib/ceph/mgr
mkdir -p %{buildroot}%{_localstatedir}/lib/ceph/crash
mkdir -p %{buildroot}%{_localstatedir}/lib/ceph/crash/posted
mkdir -p %{buildroot}%{_localstatedir}/lib/ceph/radosgw
mkdir -p %{buildroot}%{_localstatedir}/lib/ceph/bootstrap-osd
mkdir -p %{buildroot}%{_localstatedir}/lib/ceph/bootstrap-mds
Expand All @@ -1028,6 +1030,7 @@ rm -rf %{buildroot}
%files

%files base
%{_bindir}/ceph-crash
%{_bindir}/crushtool
%{_bindir}/monmaptool
%{_bindir}/osdmaptool
Expand All @@ -1046,6 +1049,7 @@ rm -rf %{buildroot}
%{_libdir}/ceph/erasure-code/libec_*.so*
%dir %{_libdir}/ceph/compressor
%{_libdir}/ceph/compressor/libceph_*.so*
%{_unitdir}/ceph-crash.service
%ifarch x86_64
%dir %{_libdir}/ceph/crypto
%{_libdir}/ceph/crypto/libceph_*.so*
Expand Down Expand Up @@ -1091,6 +1095,8 @@ rm -rf %{buildroot}
%{_mandir}/man8/monmaptool.8*
%{_mandir}/man8/ceph-kvstore-tool.8*
#set up placeholder directories
%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/crash
%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/crash/posted
%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/tmp
%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-osd
%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-mds
Expand All @@ -1103,22 +1109,22 @@ rm -rf %{buildroot}
%if 0%{?suse_version}
%fillup_only
if [ $1 -eq 1 ] ; then
/usr/bin/systemctl preset ceph-disk@\*.service ceph.target >/dev/null 2>&1 || :
/usr/bin/systemctl preset ceph-disk@\*.service ceph.target ceph-crash.service >/dev/null 2>&1 || :
fi
%endif
%if 0%{?fedora} || 0%{?rhel}
%systemd_post ceph-disk@\*.service ceph.target
%systemd_post ceph-disk@\*.service ceph.target ceph-crash.service
%endif
if [ $1 -eq 1 ] ; then
/usr/bin/systemctl start ceph.target >/dev/null 2>&1 || :
/usr/bin/systemctl start ceph.target ceph-crash.service >/dev/null 2>&1 || :
fi

%preun base
%if 0%{?suse_version}
%service_del_preun ceph-disk@\*.service ceph.target
%service_del_preun ceph-disk@\*.service ceph.target ceph-crash.service
%endif
%if 0%{?fedora} || 0%{?rhel}
%systemd_preun ceph-disk@\*.service ceph.target
%systemd_preun ceph-disk@\*.service ceph.target ceph-crash.service
%endif

%postun base
Expand Down
2 changes: 2 additions & 0 deletions debian/ceph-base.dirs
Expand Up @@ -4,3 +4,5 @@ var/lib/ceph/bootstrap-osd
var/lib/ceph/bootstrap-rgw
var/lib/ceph/bootstrap-rbd
var/lib/ceph/tmp
var/lib/ceph/crash
var/lib/ceph/crash/posted
2 changes: 2 additions & 0 deletions debian/ceph-base.install
@@ -1,4 +1,6 @@
etc/init.d/ceph
lib/systemd/system/ceph-crash.service
usr/bin/ceph-crash
usr/bin/ceph-debugpack
usr/bin/ceph-detect-init
usr/bin/ceph-run
Expand Down
60 changes: 60 additions & 0 deletions doc/mgr/crash.rst
@@ -0,0 +1,60 @@
Crash plugin
============
The crash plugin collects information about daemon crashdumps and stores
it in the Ceph cluster for later analysis.

Daemon crashdumps are dumped in /var/lib/ceph/crash by default; this can
be configured with the option 'crash dir'. Crash directories are named by
time and date and a randomly-generated UUID, and contain a metadata file
'meta' and a recent log file, with a "crash_id" that matches the directory name.
This plugin allows the metadata about those dumps to be persisted in
the monitors' storage.

Enabling
--------

The *crash* module is enabled with::

ceph mgr module enable crash

Commands
--------
::

ceph crash post -i <metafile>

Save a crash dump. The metadata file is a JSON blob stored in the crash
dir as ``meta``. As usual, the ceph command can be invoked with ``-i -``,
and will read from stdin.

::

ceph crash rm <crashid>

Remove a specific crash dump.

::

ceph crash ls

List the timestamp/uuid crashids for all saved crash info.

::

ceph crash stat

Show a summary of saved crash info grouped by age.

::

ceph crash info <crashid>

Show all details of a saved crash.

::

ceph crash prune <keep>

Remove saved crashes older than 'keep' days. <keep> must be an integer.


1 change: 1 addition & 0 deletions doc/mgr/index.rst
Expand Up @@ -38,3 +38,4 @@ sensible.
Telemetry plugin <telemetry>
Telegraf plugin <telegraf>
Iostat plugin <iostat>
Crash plugin <crash>
16 changes: 16 additions & 0 deletions qa/suites/rados/mgr/tasks/crash.yaml
@@ -0,0 +1,16 @@

tasks:
  - install:
  - ceph:
      # tests may leave mgrs broken, so don't try and call into them
      # to invoke e.g. pg dump during teardown.
      wait-for-scrub: false
      log-whitelist:
        - overall HEALTH_
        - \(MGR_DOWN\)
        - \(PG_
        - replacing it with standby
        - No standby daemons available
  - cephfs_test_runner:
      modules:
        - tasks.mgr.test_crash
14 changes: 14 additions & 0 deletions qa/suites/rados/singleton/all/test-crash.yaml
@@ -0,0 +1,14 @@
# single-node cluster: one client, one mon, one mgr, three OSDs
roles:
  - [client.0, mon.a, mgr.x, osd.0, osd.1, osd.2]

tasks:
  - install:
  - ceph:
      log-whitelist:
        - Reduced data availability
        - OSD_.*DOWN
  - workunit:
      clients:
        client.0:
          - rados/test_crash.sh
  # the workunit ABRTs the OSDs; bring them back up afterwards
  - ceph.restart: [osd.*]
9 changes: 4 additions & 5 deletions qa/tasks/ceph_manager.py
Expand Up @@ -1135,7 +1135,7 @@ def raw_cluster_cmd(self, *args):
)
return proc.stdout.getvalue()

def raw_cluster_cmd_result(self, *args):
def raw_cluster_cmd_result(self, *args, **kwargs):
"""
Start ceph on a cluster. Return success or failure information.
"""
Expand All @@ -1152,10 +1152,9 @@ def raw_cluster_cmd_result(self, *args):
self.cluster,
]
ceph_args.extend(args)
proc = self.controller.run(
args=ceph_args,
check_status=False,
)
kwargs['args'] = ceph_args
kwargs['check_status'] = False
proc = self.controller.run(**kwargs)
return proc.exitstatus

def run_ceph_w(self):
Expand Down
108 changes: 108 additions & 0 deletions qa/tasks/mgr/test_crash.py
@@ -0,0 +1,108 @@


from mgr_test_case import MgrTestCase

import json
import logging
import datetime

log = logging.getLogger(__name__)
# Fixed UUID so the generated crash ids are deterministic across runs.
UUID = 'd5775432-0742-44a3-a435-45095e32e6b1'
# Timestamp format used by the mgr crash module when building crash ids.
DATEFMT = '%Y-%m-%d %H:%M:%S.%f'


class TestCrash(MgrTestCase):
    """
    Exercise the mgr 'crash' module commands: post, ls, rm, info,
    stat and prune.

    NOTE(review): the original used py2-only dict idioms
    (``itervalues()``, ``keys()[0]``); replaced with forms that work
    on both Python 2 and 3 without changing behavior.
    """

    def setUp(self):
        self.setup_mgrs()
        self._load_module('crash')

        # Whip up some crash data: five synthetic reports whose
        # timestamps are 0, 1, 3, 4 and 8 days in the past, so the
        # stat/prune age buckets each hold a known number of crashes.
        self.crashes = dict()
        now = datetime.datetime.utcnow()

        for i in (0, 1, 3, 4, 8):
            timestamp = now - datetime.timedelta(days=i)
            timestamp = timestamp.strftime(DATEFMT) + 'Z'
            # crash ids are "<timestamp>_<uuid>" with spaces replaced
            crash_id = '_'.join((timestamp, UUID)).replace(' ', '_')
            self.crashes[crash_id] = {
                'crash_id': crash_id, 'timestamp': timestamp,
            }

            # post each crash via stdin; rc 0 means it was stored
            self.assertEqual(
                0,
                self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                    'crash', 'post', '-i', '-',
                    stdin=json.dumps(self.crashes[crash_id]),
                )
            )

        retstr = self.mgr_cluster.mon_manager.raw_cluster_cmd(
            'crash', 'ls',
        )
        log.warning("setUp: crash ls returns %s" % retstr)

        # last loop iteration (i=8) produced the oldest crash
        self.oldest_crashid = crash_id

    def tearDown(self):
        # best-effort removal (rc ignored) so a failed rm doesn't mask
        # the actual test result
        for crash in self.crashes.values():
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                'crash', 'rm', crash['crash_id']
            )

    def test_info(self):
        """'crash info <id>' returns JSON containing crash_id and timestamp."""
        for crash in self.crashes.values():
            log.warning('test_info: crash %s' % crash)
            retstr = self.mgr_cluster.mon_manager.raw_cluster_cmd(
                'crash', 'ls'
            )
            log.warning('ls output: %s' % retstr)
            retstr = self.mgr_cluster.mon_manager.raw_cluster_cmd(
                'crash', 'info', crash['crash_id'],
            )
            log.warning('crash info output: %s' % retstr)
            crashinfo = json.loads(retstr)
            self.assertIn('crash_id', crashinfo)
            self.assertIn('timestamp', crashinfo)

    def test_ls(self):
        """Every posted crash id shows up in 'crash ls'."""
        retstr = self.mgr_cluster.mon_manager.raw_cluster_cmd(
            'crash', 'ls',
        )
        for crash in self.crashes.values():
            self.assertIn(crash['crash_id'], retstr)

    def test_rm(self):
        """A removed crash no longer appears in 'crash ls'."""
        crashid = list(self.crashes.keys())[0]
        self.assertEqual(
            0,
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                'crash', 'rm', crashid,
            )
        )

        retstr = self.mgr_cluster.mon_manager.raw_cluster_cmd(
            'crash', 'ls',
        )
        self.assertNotIn(crashid, retstr)

    def test_stat(self):
        """'crash stat' buckets the 5 fixtures by age as expected."""
        retstr = self.mgr_cluster.mon_manager.raw_cluster_cmd(
            'crash', 'stat',
        )
        self.assertIn('5 crashes recorded', retstr)
        self.assertIn('4 older than 1 days old:', retstr)
        self.assertIn('3 older than 3 days old:', retstr)
        self.assertIn('1 older than 7 days old:', retstr)

    def test_prune(self):
        """'crash prune 5' drops the 8-day-old crash but keeps newer ones."""
        self.assertEqual(
            0,
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                'crash', 'prune', '5'
            )
        )
        retstr = self.mgr_cluster.mon_manager.raw_cluster_cmd(
            'crash', 'ls',
        )
        self.assertNotIn(self.oldest_crashid, retstr)
3 changes: 3 additions & 0 deletions qa/tasks/mgr/test_module_selftest.py
Expand Up @@ -58,6 +58,9 @@ def test_selftest_run(self):
def test_telemetry(self):
self._selftest_plugin("telemetry")

def test_crash(self):
self._selftest_plugin("crash")

def test_selftest_config_update(self):
"""
That configuration updates are seen by running mgr modules
Expand Down
9 changes: 5 additions & 4 deletions qa/tasks/vstart_runner.py
Expand Up @@ -558,19 +558,20 @@ def run_ceph_w(self):
proc = self.controller.run([os.path.join(BIN_PREFIX, "ceph"), "-w"], wait=False, stdout=StringIO())
return proc

def raw_cluster_cmd(self, *args):
def raw_cluster_cmd(self, *args, **kwargs):
"""
args like ["osd", "dump"}
return stdout string
"""
proc = self.controller.run([os.path.join(BIN_PREFIX, "ceph")] + list(args))
proc = self.controller.run([os.path.join(BIN_PREFIX, "ceph")] + list(args), **kwargs)
return proc.stdout.getvalue()

def raw_cluster_cmd_result(self, *args):
def raw_cluster_cmd_result(self, *args, **kwargs):
"""
like raw_cluster_cmd but don't check status, just return rc
"""
proc = self.controller.run([os.path.join(BIN_PREFIX, "ceph")] + list(args), check_status=False)
kwargs['check_status'] = False
proc = self.controller.run([os.path.join(BIN_PREFIX, "ceph")] + list(args), **kwargs)
return proc.exitstatus

def admin_socket(self, daemon_type, daemon_id, command, check_status=True):
Expand Down
33 changes: 33 additions & 0 deletions qa/workunits/rados/test_crash.sh
@@ -0,0 +1,33 @@
#!/bin/sh

set -x

# run on a single-node three-OSD cluster

# ABRT all OSDs so they produce crashdumps (and coredumps)
sudo killall -ABRT ceph-osd
sleep 5

# kill caused coredumps; find them and delete them, carefully, so as
# not to disturb other coredumps, or else teuthology will see them
# and assume test failure.  sudos are because the core files are
# root/600
for f in $(find "$TESTDIR/archive/coredump" -type f); do
    gdb_output=$(echo "quit" | sudo gdb /usr/bin/ceph-osd "$f")
    # only remove cores gdb confirms came from ceph-osd dying on
    # SIGABRT (signal 6) -- i.e. the crashes we just caused above
    if expr match "$gdb_output" ".*generated.*ceph-osd.*" && \
       ( \
         expr match "$gdb_output" ".*terminated.*signal 6.*" || \
         expr match "$gdb_output" ".*terminated.*signal SIGABRT.*" \
       )
    then
        sudo rm "$f"
    fi
done

# let daemon find crashdumps on startup
sudo systemctl restart ceph-crash
sleep 30

# must be 3 crashdumps registered and moved to crash/posted
[ $(ceph crash ls | wc -l) = 3 ] || exit 1
[ $(sudo find /var/lib/ceph/crash/posted/ -name meta | wc -l) = 3 ] || exit 1
4 changes: 4 additions & 0 deletions src/CMakeLists.txt
Expand Up @@ -950,6 +950,9 @@ configure_file(${CMAKE_SOURCE_DIR}/src/init-ceph.in
configure_file(ceph-post-file.in
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ceph-post-file @ONLY)

configure_file(ceph-crash.in
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ceph-crash @ONLY)

if(WITH_TESTS)
install(PROGRAMS
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ceph-debugpack
Expand All @@ -960,6 +963,7 @@ endif()
install(PROGRAMS
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ceph
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ceph-post-file
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ceph-crash
${CMAKE_SOURCE_DIR}/src/ceph-run
${CMAKE_SOURCE_DIR}/src/ceph-clsinfo
DESTINATION bin)
Expand Down

0 comments on commit 5ae3e4b

Please sign in to comment.