From bee73d2429628e7d27cf9b1ca67eb5d5f049f285 Mon Sep 17 00:00:00 2001 From: Yang Honggang Date: Thu, 13 Apr 2017 20:09:07 +0800 Subject: [PATCH] cephfs: fix write_buf's _len overflow problem After I have set about 400 64KB xattr kv pair to a file, mds is crashed. Every time I try to start mds, it will crash again. The root reason is write_buf._len overflowed when doing Journaler::append_entry(). This patch try to fix this problem through the following changes: 1. limit file/dir's xattr size 2. throttle journal entry append operations Fixes: http://tracker.ceph.com/issues/19033 Signed-off-by: Yang Honggang joseph.yang@xtaotech.com (cherry picked from commit eb915d0eeccbe523f8f70f6571880003ff459459) --- src/common/config_opts.h | 2 ++ src/mds/Server.cc | 26 +++++++++++++++++++++++--- src/osdc/Journaler.cc | 26 +++++++++++++++++++++++--- src/osdc/Journaler.h | 13 ++++++++++++- 4 files changed, 60 insertions(+), 7 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 132d18970894e..b03597a0f3b69 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -475,6 +475,8 @@ OPTION(journaler_batch_interval, OPT_DOUBLE, .001) // seconds.. max add latenc OPTION(journaler_batch_max, OPT_U64, 0) // max bytes we'll delay flushing; disable, for now.... OPTION(mds_data, OPT_STR, "/var/lib/ceph/mds/$cluster-$id") OPTION(mds_max_file_size, OPT_U64, 1ULL << 40) // Used when creating new CephFS. Change with 'ceph mds set max_file_size ' afterwards +// max xattr kv pairs size for each dir/file +OPTION(mds_max_xattr_pairs_size, OPT_U32, 64 << 10) OPTION(mds_cache_size, OPT_INT, 100000) OPTION(mds_cache_mid, OPT_FLOAT, .7) OPTION(mds_max_file_recover, OPT_U32, 32) diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 007ee12802ee0..c8a55f4997253 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -3387,7 +3387,8 @@ void Server::handle_client_readdir(MDRequestRef& mdr) max = dir->get_num_any(); // whatever, something big. unsigned max_bytes = req->head.args.readdir.max_bytes; if (!max_bytes) - max_bytes = 512 << 10; // 512 KB? + // make sure at least one item can be encoded + max_bytes = (512 << 10) + g_conf->mds_max_xattr_pairs_size; // start final blob bufferlist dirbl; @@ -4506,6 +4507,25 @@ void Server::handle_client_setxattr(MDRequestRef& mdr) return; map *pxattrs = cur->get_projected_xattrs(); + size_t len = req->get_data().length(); + size_t inc = len + name.length(); + + // check xattrs kv pairs size + size_t cur_xattrs_size = 0; + for (const auto& p : *pxattrs) { + if ((flags & CEPH_XATTR_REPLACE) && (name.compare(p.first) == 0)) { + continue; + } + cur_xattrs_size += p.first.length() + p.second.length(); + } + + if (((cur_xattrs_size + inc) > g_conf->mds_max_xattr_pairs_size)) { + dout(10) << "xattr kv pairs size too big. cur_xattrs_size " + << cur_xattrs_size << ", inc " << inc << dendl; + respond_to_request(mdr, -ENOSPC); + return; + } + if ((flags & CEPH_XATTR_CREATE) && pxattrs->count(name)) { dout(10) << "setxattr '" << name << "' XATTR_CREATE and EEXIST on " << *cur << dendl; respond_to_request(mdr, -EEXIST); @@ -4517,7 +4537,6 @@ void Server::handle_client_setxattr(MDRequestRef& mdr) return; } - int len = req->get_data().length(); dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl; // project update @@ -7996,7 +8015,8 @@ void Server::handle_client_lssnap(MDRequestRef& mdr) max_entries = infomap.size(); int max_bytes = req->head.args.readdir.max_bytes; if (!max_bytes) - max_bytes = 512 << 10; + // make sure at least one item can be encoded + max_bytes = (512 << 10) + g_conf->mds_max_xattr_pairs_size; __u64 last_snapid = 0; string offset_str = req->get_path2(); diff --git a/src/osdc/Journaler.cc b/src/osdc/Journaler.cc index 4b517cc34ce6e..59271b81bc7f4 100644 --- a/src/osdc/Journaler.cc +++ b/src/osdc/Journaler.cc @@ -537,7 +537,7 @@ void Journaler::_finish_flush(int r, uint64_t start, ceph::real_time stamp) uint64_t Journaler::append_entry(bufferlist& bl) { - lock_guard l(lock); + unique_lock l(lock); assert(!readonly); uint32_t s = bl.length(); @@ -556,6 +556,15 @@ uint64_t Journaler::append_entry(bufferlist& bl) bufferptr bp(write_pos - owp); bp.zero(); assert(bp.length() >= 4); + if (!write_buf_throttle.get_or_fail(bp.length())) { + l.unlock(); + ldout(cct, 10) << "write_buf_throttle wait, bp len " + << bp.length() << dendl; + write_buf_throttle.get(bp.length()); + l.lock(); + } + ldout(cct, 20) << "write_buf_throttle get, bp len " + << bp.length() << dendl; write_buf.push_back(bp); // now flush. @@ -569,6 +578,15 @@ uint64_t Journaler::append_entry(bufferlist& bl) // append + size_t delta = bl.length() + journal_stream.get_envelope_size(); + // write_buf space is nearly full + if (!write_buf_throttle.get_or_fail(delta)) { + l.unlock(); + ldout(cct, 10) << "write_buf_throttle wait, delta " << delta << dendl; + write_buf_throttle.get(delta); + l.lock(); + } + ldout(cct, 20) << "write_buf_throttle get, delta " << delta << dendl; size_t wrote = journal_stream.write(bl, &write_buf, write_pos); ldout(cct, 10) << "append_entry len " << s << " to " << write_pos << "~" << wrote << dendl; @@ -598,7 +616,7 @@ void Journaler::_do_flush(unsigned amount) assert(!readonly); // flush - unsigned len = write_pos - flush_pos; + uint64_t len = write_pos - flush_pos; assert(len == write_buf.length()); if (amount && amount < len) len = amount; @@ -654,7 +672,9 @@ void Journaler::_do_flush(unsigned amount) flush_pos += len; assert(write_buf.length() == write_pos - flush_pos); - + write_buf_throttle.put(len); + ldout(cct, 20) << "write_buf_throttle put, len " << len << dendl; + ldout(cct, 10) << "_do_flush (prezeroing/prezero)/write/flush/safe pointers now at " << "(" << prezeroing_pos << "/" << prezero_pos << ")/" << write_pos diff --git a/src/osdc/Journaler.h b/src/osdc/Journaler.h index 6c7e7cf9db4e7..df2cadfcca8ab 100644 --- a/src/osdc/Journaler.h +++ b/src/osdc/Journaler.h @@ -64,7 +64,7 @@ #include "Filer.h" #include "common/Timer.h" - +#include "common/Throttle.h" class CephContext; class Context; @@ -113,6 +113,13 @@ class JournalStream bool readable(bufferlist &bl, uint64_t *need) const; size_t read(bufferlist &from, bufferlist *to, uint64_t *start_ptr); size_t write(bufferlist &entry, bufferlist *to, uint64_t const &start_ptr); + size_t get_envelope_size() const { + if (format >= JOURNAL_FORMAT_RESILIENT) { + return JOURNAL_ENVELOPE_RESILIENT; + } else { + return JOURNAL_ENVELOPE_LEGACY; + } + } // A magic number for the start of journal entries, so that we can // identify them in damaged journals. @@ -295,6 +302,9 @@ class Journaler { bufferlist write_buf; ///< write buffer. flush_pos + /// write_buf.length() == write_pos. + // protect write_buf from bufferlist _len overflow + Throttle write_buf_throttle; + bool waiting_for_zero; interval_set pending_zero; // non-contig bits we've zeroed std::set pending_safe; @@ -390,6 +400,7 @@ class Journaler { timer(tim), delay_flush_event(0), state(STATE_UNDEF), error(0), prezeroing_pos(0), prezero_pos(0), write_pos(0), flush_pos(0), safe_pos(0), + write_buf_throttle(cct, "write_buf_throttle", UINT_MAX - (UINT_MAX >> 3)), waiting_for_zero(false), read_pos(0), requested_pos(0), received_pos(0), fetch_len(0), temp_fetch_len(0),