Skip to content

Commit

Permalink
Merge pull request #11126: jewel: add a tool to rebuild mon store fro…
Browse files Browse the repository at this point in the history
…m OSD

Reviewed-by: Loic Dachary <ldachary@redhat.com>
  • Loading branch information
Loic Dachary committed Oct 18, 2016
2 parents 23b0c78 + 25a35d4 commit 2a1e931
Show file tree
Hide file tree
Showing 8 changed files with 695 additions and 5 deletions.
82 changes: 82 additions & 0 deletions doc/rados/troubleshooting/troubleshooting-mon.rst
Expand Up @@ -383,6 +383,86 @@ example::

iptables -A INPUT -m multiport -p tcp -s {ip-address}/{netmask} --dports 6789,6800:7300 -j ACCEPT

Monitor Store Failures
======================

Symptoms of store corruption
----------------------------

Ceph monitors store the `cluster map`_ in a key/value store such as LevelDB. If
a monitor fails due to corruption of the key/value store, the following error
messages might be found in the monitor log::

Corruption: error in middle of record

or::

Corruption: 1 missing files; e.g.: /var/lib/ceph/mon/mon.0/store.db/1234567.ldb

Recovery using healthy monitor(s)
---------------------------------

If there are any survivors, we can always `replace`_ the corrupted monitor with
a new one. After booting up, the new joiner will sync up with a healthy peer,
and once it is fully synchronized, it will be able to serve the clients.

Recovery using OSDs
-------------------

But what if all monitors fail at the same time? Since users are encouraged to
deploy at least three monitors in a Ceph cluster, simultaneous failure is
unlikely. But an unplanned power-down in a data center with improperly
configured disk/fs settings could corrupt the underlying filesystem, and hence
kill all the monitors. In this case, we can recover the monitor store with the
information stored in the OSDs::

ms=/tmp/mon-store
mkdir $ms
# collect the cluster map from the OSDs on every host listed in $hosts
for host in $hosts; do
  # push the store gathered so far to the host. the trailing slashes make
  # rsync copy the directory contents instead of nesting $ms inside itself
  rsync -avz $ms/ user@$host:$ms/
  rm -rf $ms
  # the heredoc is unquoted: $ms is expanded locally, while \$osd is escaped
  # so that it is expanded by the remote shell inside the remote loop
  ssh user@$host <<EOF
for osd in /var/lib/ceph/osd/ceph-*; do
  ceph-objectstore-tool --data-path \$osd --op update-mon-db --mon-store-path $ms
done
EOF
  # pull the updated store back before moving on to the next host
  rsync -avz user@$host:$ms/ $ms/
done
# rebuild the monitor store from the collected map, if the cluster does not
# use cephx authentication, we can skip the following steps to update the
# keyring with the caps, and there is no need to pass the "--keyring" option.
# i.e. just use "ceph-monstore-tool /tmp/mon-store rebuild" instead
ceph-authtool /path/to/admin.keyring -n mon. \
  --cap mon 'allow *'
ceph-authtool /path/to/admin.keyring -n client.admin \
  --cap mon 'allow *' --cap osd 'allow *' --cap mds 'allow *'
ceph-monstore-tool /tmp/mon-store rebuild -- --keyring /path/to/admin.keyring
# backup corrupted store.db just in case
mv /var/lib/ceph/mon/mon.0/store.db /var/lib/ceph/mon/mon.0/store.db.corrupted
mv /tmp/mon-store/store.db /var/lib/ceph/mon/mon.0/store.db

The steps above

#. collect the map from all OSD hosts,
#. then rebuild the store,
#. replace the corrupted store on ``mon.0`` with the recovered copy.

Known limitations
~~~~~~~~~~~~~~~~~

The following information is not recoverable using the steps above:

- **some added keyrings**: all the OSD keyrings added using ``ceph auth add`` command
are recovered from the OSD's copy. And the ``client.admin`` keyring is imported
using ``ceph-monstore-tool``. But the MDS keyrings and other keyrings are missing
in the recovered monitor store. You might need to re-add them manually.

- **pg settings**: the ``full ratio`` and ``nearfull ratio`` settings configured using
``ceph pg set_full_ratio`` and ``ceph pg set_nearfull_ratio`` will be lost.

- **MDS Maps**: the MDS maps are lost.


Everything Failed! Now What?
=============================
Expand Down Expand Up @@ -480,4 +560,6 @@ based on that.
Finally, you should reach out to us on the mailing lists, on IRC or file
a new issue on the `tracker`_.

.. _cluster map: ../../architecture#cluster-map
.. _replace: ../operation/add-or-rm-mons
.. _tracker: http://tracker.ceph.com/projects/ceph/issues/new
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Expand Up @@ -464,6 +464,7 @@ install(TARGETS ceph-monstore-tool DESTINATION bin)

add_executable(ceph-objectstore-tool
tools/ceph_objectstore_tool.cc
tools/rebuild_mondb.cc
tools/RadosDump.cc
$<TARGET_OBJECTS:common_util_obj>)
target_link_libraries(ceph-objectstore-tool tcmalloc osd os global ${Boost_PROGRAM_OPTIONS_LIBRARY} fuse dl)
Expand Down
2 changes: 1 addition & 1 deletion src/mon/AuthMonitor.h
Expand Up @@ -35,11 +35,11 @@ class KeyRing;
#define MIN_GLOBAL_ID 0x1000

class AuthMonitor : public PaxosService {
public:
enum IncType {
GLOBAL_ID,
AUTH_DATA,
};
public:
struct Incremental {
IncType inc_type;
uint64_t max_global_id;
Expand Down
6 changes: 5 additions & 1 deletion src/tools/Makefile-server.am
Expand Up @@ -18,7 +18,11 @@ endif

if WITH_OSD

ceph_objectstore_tool_SOURCES = tools/ceph_objectstore_tool.cc tools/RadosDump.cc
ceph_objectstore_tool_SOURCES = \
tools/ceph_objectstore_tool.cc \
tools/rebuild_mondb.cc \
tools/rebuild_mondb.h \
tools/RadosDump.cc
ceph_objectstore_tool_LDADD = $(LIBOSD) $(LIBOS) $(CEPH_GLOBAL) $(BOOST_PROGRAM_OPTIONS_LIBS)
if LINUX
ceph_objectstore_tool_LDADD += -ldl
Expand Down
194 changes: 194 additions & 0 deletions src/tools/ceph_monstore_tool.cc
Expand Up @@ -20,8 +20,11 @@
#include "common/Formatter.h"
#include "common/errno.h"

#include "auth/KeyRing.h"
#include "auth/cephx/CephxKeyServer.h"
#include "global/global_init.h"
#include "include/stringify.h"
#include "mon/AuthMonitor.h"
#include "mon/MonitorDBStore.h"
#include "mon/Paxos.h"
#include "mon/MonMap.h"
Expand Down Expand Up @@ -217,6 +220,8 @@ void usage(const char *n, po::options_description &d)
<< " (rewrite-crush -- --help for more info)\n"
<< " inflate-pgmap [-- options] add given number of pgmaps to store\n"
<< " (inflate-pgmap -- --help for more info)\n"
<< " rebuild rebuild store\n"
<< " (rebuild -- --help for more info)\n"
<< std::endl;
std::cerr << d << std::endl;
std::cerr
Expand Down Expand Up @@ -509,6 +514,193 @@ int inflate_pgmap(MonitorDBStore& st, unsigned n, bool can_be_trimmed) {
return 0;
}

// Import every key of the keyring file at keyring_path into the "auth"
// section of the store, recorded as one new committed version.
//
// Returns 0 on success, the negative result of KeyRing::load() if the keyring
// cannot be loaded, or -EINVAL if a key has no caps. The store is only
// written once every key has been encoded successfully.
static int update_auth(MonitorDBStore& st, const string& keyring_path)
{
  KeyRing keyring;
  int load_result = keyring.load(g_ceph_context, keyring_path);
  if (load_result < 0) {
    cerr << "unable to load admin keyring: " << keyring_path << std::endl;
    return load_result;
  }

  // the payload is a one-byte version followed by one encoded
  // AuthMonitor::Incremental per key
  bufferlist payload;
  __u8 struct_v = 1;
  ::encode(struct_v, payload);

  for (const auto& entry : keyring.get_keys()) {
    KeyServerData::Incremental key_inc;
    key_inc.name = entry.first;
    key_inc.auth = entry.second;
    // refuse to import a key that would not be granted anything
    if (key_inc.auth.caps.empty()) {
      cerr << "no caps granted to: " << key_inc.name << std::endl;
      return -EINVAL;
    }
    key_inc.op = KeyServerData::AUTH_INC_ADD;

    AuthMonitor::Incremental mon_inc;
    mon_inc.inc_type = AuthMonitor::AUTH_DATA;
    ::encode(key_inc, mon_inc.auth_data);
    mon_inc.auth_type = CEPH_AUTH_CEPHX;

    mon_inc.encode(payload, CEPH_FEATURES_ALL);
  }

  const string prefix("auth");
  auto next_version = st.get(prefix, "last_committed") + 1;
  auto first_committed = st.get(prefix, "first_committed");

  auto t = make_shared<MonitorDBStore::Transaction>();
  t->put(prefix, next_version, payload);
  t->put(prefix, "last_committed", next_version);
  // only initialize first_committed if the store does not have one yet
  if (first_committed == 0) {
    t->put(prefix, "first_committed", next_version);
  }
  st.apply_transaction(t);
  return 0;
}

// Store an initial monmap under "mkfs"/"monmap".
//
// The monmap is built with MonMap::build_initial() and stored with epoch 0.
//
// Returns 0 on success, or -EINVAL if build_initial() reports an error.
static int update_mkfs(MonitorDBStore& st)
{
  MonMap monmap;
  int r = monmap.build_initial(g_ceph_context, cerr);
  if (r) {
    cerr << "no initial monitors" << std::endl;
    return -EINVAL;
  }
  // reset the epoch *before* encoding, otherwise the reset has no effect on
  // the bytes that get written to the store
  monmap.set_epoch(0);
  bufferlist bl;
  monmap.encode(bl, CEPH_FEATURES_ALL);
  auto t = make_shared<MonitorDBStore::Transaction>();
  t->put("mkfs", "monmap", bl);
  st.apply_transaction(t);
  return 0;
}

// Write the on-disk magic string under "monitor"/"magic"; this is a
// stripped-down Monitor::mkfs(). Always returns 0.
static int update_monitor(MonitorDBStore& st)
{
  bufferlist magic;
  magic.append(CEPH_MON_ONDISK_MAGIC "\n");

  const string prefix("monitor");
  auto txn = make_shared<MonitorDBStore::Transaction>();
  txn->put(prefix, "magic", magic);
  st.apply_transaction(txn);
  return 0;
}

// Seed the "paxos" section of the store with a single pending proposal.
//
// The proposal is built from all non-permanent k/v pairs. Once it is
// committed, it gets applied: on the sync provider side that is a no-op, but
// on its peers the paxos commit helps to build up the necessary epochs.
// Always returns 0.
static int update_paxos(MonitorDBStore& st)
{
  bufferlist pending_proposal;
  {
    // replay every key of the listed services into one transaction and keep
    // only its encoded form
    const vector<string> services = {"auth", "osdmap",
                                     "pgmap", "pgmap_pg", "pgmap_meta"};
    MonitorDBStore::Transaction snapshot;
    for (const auto& service : services) {
      auto it = st.get_iterator(service);
      while (it->valid()) {
        auto raw = it->raw_key();
        auto value = it->value();
        snapshot.put(raw.first, raw.second, value);
        it->next();
      }
    }
    snapshot.encode(pending_proposal);
  }

  const string prefix("paxos");
  auto pending_v = 1;
  auto txn = make_shared<MonitorDBStore::Transaction>();
  txn->put(prefix, "first_committed", 0);
  txn->put(prefix, "last_committed", 0);
  txn->put(prefix, pending_v, pending_proposal);
  txn->put(prefix, "pending_v", pending_v);
  txn->put(prefix, "pending_pn", 400);
  st.apply_transaction(txn);
  return 0;
}

// Rebuild the "pgmap_meta" section of the store. The keys written are:
// - pgmap_meta/version            (1)
// - pgmap_meta/stamp              (current time)
// - pgmap_meta/last_osdmap_epoch  (osdmap/last_committed)
// - pgmap_meta/last_pg_scan       (1)
// - pgmap_meta/full_ratio         (from mon_osd_full_ratio)
// - pgmap_meta/nearfull_ratio     (from mon_osd_nearfull_ratio)
// Always returns 0.
static int update_pgmap_meta(MonitorDBStore& st)
{
  const string prefix("pgmap_meta");
  auto txn = make_shared<MonitorDBStore::Transaction>();

  // stolen from PGMonitor::create_pending()
  // the first pgmap_meta
  txn->put(prefix, "version", 1);

  auto now = ceph_clock_now(g_ceph_context);
  bufferlist stamp_bl;
  ::encode(now, stamp_bl);
  txn->put(prefix, "stamp", stamp_bl);

  auto osdmap_epoch = st.get("osdmap", "last_committed");
  txn->put(prefix, "last_osdmap_epoch", osdmap_epoch);

  // be conservative, so PGMonitor will scan the all pools for pg changes
  txn->put(prefix, "last_pg_scan", 1);

  // a configured ratio above 1.0 is scaled down by 100 before it is stored
  // (presumably it was given as a percentage)
  auto full = g_ceph_context->_conf->mon_osd_full_ratio;
  if (full > 1.0) {
    full /= 100.0;
  }
  bufferlist full_bl;
  ::encode(full, full_bl);
  txn->put(prefix, "full_ratio", full_bl);

  auto nearfull = g_ceph_context->_conf->mon_osd_nearfull_ratio;
  if (nearfull > 1.0) {
    nearfull /= 100.0;
  }
  bufferlist nearfull_bl;
  ::encode(nearfull, nearfull_bl);
  txn->put(prefix, "nearfull_ratio", nearfull_bl);

  st.apply_transaction(txn);
  return 0;
}

// Entry point of the "rebuild" command: rebuild the parts of the monitor
// store that cannot be collected from the OSDs.
//
// progname: program name, only used when printing the usage message
// subcmds:  the arguments following "rebuild --"; "--keyring <path>" names a
//           keyring file whose keys are imported into the auth section
// st:       the store to update
//
// Returns 0 on success (or after printing the help), the negated result of
// parse_cmd_args() if the options cannot be parsed, or the non-zero result of
// the first update step that fails.
int rebuild_monstore(const char* progname,
                     vector<string>& subcmds,
                     MonitorDBStore& st)
{
  po::options_description op_desc("Allowed 'rebuild' options");
  string keyring_path;
  op_desc.add_options()
    ("keyring", po::value<string>(&keyring_path),
     "path to the client.admin key");
  po::variables_map op_vm;
  int r = parse_cmd_args(&op_desc, nullptr, nullptr, subcmds, &op_vm);
  if (r) {
    return -r;
  }
  if (op_vm.count("help")) {
    usage(progname, op_desc);
    return 0;
  }
  // importing the keyring is optional, but when one was requested a failure
  // must not be ignored: the rebuilt store would silently lack the keys
  if (!keyring_path.empty()) {
    if ((r = update_auth(st, keyring_path))) {
      return r;
    }
  }
  if ((r = update_pgmap_meta(st))) {
    return r;
  }
  // update_paxos() snapshots the auth and pgmap sections, so it has to run
  // after they have been updated above
  if ((r = update_paxos(st))) {
    return r;
  }
  if ((r = update_mkfs(st))) {
    return r;
  }
  if ((r = update_monitor(st))) {
    return r;
  }
  return 0;
}

int main(int argc, char **argv) {
int err = 0;
po::options_description desc("Allowed options");
Expand Down Expand Up @@ -1094,6 +1286,8 @@ int main(int argc, char **argv) {
goto done;
}
err = inflate_pgmap(st, n, can_be_trimmed);
} else if (cmd == "rebuild") {
err = rebuild_monstore(argv[0], subcmds, st);
} else {
std::cerr << "Unrecognized command: " << cmd << std::endl;
usage(argv[0], desc);
Expand Down

0 comments on commit 2a1e931

Please sign in to comment.