From d4b9431dfe23a97075766fa2a2b76d7554543b0c Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 18 May 2022 12:59:38 +0800 Subject: [PATCH] mds: wait unlink to finish to avoid conflict when creating same dentries If the previous unlink request has been delayed due to some reasons, and the new creating for the same dentry may fail or new open will succeeds but new contents wrote to it will be lost. The kernel client will make sure before the unlink getting the first reply it won't send the followed create requests for the same dentry. Here we need to make sure that before the first reply has been sent out the dentry must be marked as unlinking. Fixes: https://tracker.ceph.com/issues/55332 Signed-off-by: Xiubo Li --- src/mds/CDentry.h | 15 +++-- src/mds/MDCache.cc | 82 +++++++++++++++++++++--- src/mds/MDCache.h | 3 +- src/mds/Server.cc | 117 ++++++++++++++++++++++++++++++++--- src/mds/Server.h | 3 + src/messages/MDentryUnlink.h | 60 ++++++++++++++++-- src/msg/Message.cc | 3 + src/msg/Message.h | 1 + 8 files changed, 255 insertions(+), 29 deletions(-) diff --git a/src/mds/CDentry.h b/src/mds/CDentry.h index 68d5d8b34c508..f6cbf49d009fb 100644 --- a/src/mds/CDentry.h +++ b/src/mds/CDentry.h @@ -29,6 +29,7 @@ #include "BatchOp.h" #include "MDSCacheObject.h" #include "MDSContext.h" +#include "Mutation.h" #include "SimpleLock.h" #include "LocalLockC.h" #include "ScrubHeader.h" @@ -86,18 +87,23 @@ class CDentry : public MDSCacheObject, public LRUObject, public Counter static const int STATE_EVALUATINGSTRAY = (1<<4); static const int STATE_PURGINGPINNED = (1<<5); static const int STATE_BOTTOMLRU = (1<<6); + static const int STATE_UNLINKING = (1<<7); // stray dentry needs notification of releasing reference static const int STATE_STRAY = STATE_NOTIFYREF; static const int MASK_STATE_IMPORT_KEPT = STATE_BOTTOMLRU; // -- pins -- - static const int PIN_INODEPIN = 1; // linked inode is pinned - static const int PIN_FRAGMENTING = -2; // containing dir is refragmenting - static const int PIN_PURGING = 3; - static const int PIN_SCRUBPARENT = 4; + static const int PIN_INODEPIN = 1; // linked inode is pinned + static const int PIN_FRAGMENTING = -2; // containing dir is refragmenting + static const int PIN_PURGING = 3; + static const int PIN_SCRUBPARENT = 4; + static const int PIN_WAITUNLINKSTATE = 5; static const unsigned EXPORT_NONCE = 1; + const static uint64_t WAIT_UNLINK_STATE = (1<<0); + const static uint64_t WAIT_UNLINK_FINISH = (1<<1); + uint32_t replica_unlinking_ref = 0; CDentry(std::string_view n, __u32 h, mempool::mds_co::string alternate_name, @@ -136,6 +142,7 @@ class CDentry : public MDSCacheObject, public LRUObject, public Counter case PIN_FRAGMENTING: return "fragmenting"; case PIN_PURGING: return "purging"; case PIN_SCRUBPARENT: return "scrubparent"; + case PIN_WAITUNLINKSTATE: return "waitunlinkstate"; default: return generic_pin_name(p); } } diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 2d27cbf137ec4..10bd2527ed362 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -8233,6 +8233,10 @@ void MDCache::dispatch(const cref_t &m) case MSG_MDS_DENTRYUNLINK: handle_dentry_unlink(ref_cast(m)); break; + case MSG_MDS_DENTRYUNLINK_ACK: + handle_dentry_unlink_ack(ref_cast(m)); + break; + case MSG_MDS_FRAGMENTNOTIFY: handle_fragment_notify(ref_cast(m)); @@ -11194,7 +11198,8 @@ void MDCache::handle_dentry_link(const cref_t &m) // UNLINK -void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr) +void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, + MDRequestRef& mdr, bool unlinking) { dout(10) << __func__ << " " << *dn << dendl; // share unlink news with replicas @@ -11206,6 +11211,11 @@ void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& md CInode *strayin = straydn->get_linkage()->get_inode(); strayin->encode_snap_blob(snapbl); } + + if (unlinking) { + ceph_assert(!straydn); + dn->replica_unlinking_ref = 0; + } for (set::iterator it = replicas.begin(); it != replicas.end(); ++it) { @@ -11218,12 +11228,21 @@ void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& md rejoin_gather.count(*it))) continue; - auto unlink = make_message(dn->get_dir()->dirfrag(), dn->get_name()); + auto unlink = make_message(dn->get_dir()->dirfrag(), + dn->get_name(), unlinking); if (straydn) { encode_replica_stray(straydn, *it, unlink->straybl); unlink->snapbl = snapbl; } mds->send_message_mds(unlink, *it); + if (unlinking) { + dn->replica_unlinking_ref++; + dn->get(CDentry::PIN_WAITUNLINKSTATE); + } + } + + if (unlinking && dn->replica_unlinking_ref) { + dn->add_waiter(CDentry::WAIT_UNLINK_STATE, new C_MDS_RetryRequest(this, mdr)); } } @@ -11232,23 +11251,40 @@ void MDCache::handle_dentry_unlink(const cref_t &m) // straydn CDentry *straydn = nullptr; CInode *strayin = nullptr; + if (m->straybl.length()) decode_replica_stray(straydn, &strayin, m->straybl, mds_rank_t(m->get_source().num())); + boost::intrusive_ptr ack; + CDentry::linkage_t *dnl; + CDentry *dn; + CInode *in; + bool hadrealm; + CDir *dir = get_dirfrag(m->get_dirfrag()); if (!dir) { dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl; + if (m->is_unlinking()) + goto ack; } else { - CDentry *dn = dir->lookup(m->get_dn()); + dn = dir->lookup(m->get_dn()); if (!dn) { dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl; + if (m->is_unlinking()) + goto ack; } else { dout(7) << __func__ << " on " << *dn << dendl; - CDentry::linkage_t *dnl = dn->get_linkage(); + + if (m->is_unlinking()) { + dn->state_set(CDentry::STATE_UNLINKING); + goto ack; + } + + dnl = dn->get_linkage(); // open inode? if (dnl->is_primary()) { - CInode *in = dnl->get_inode(); + in = dnl->get_inode(); dn->dir->unlink_inode(dn); ceph_assert(straydn); straydn->dir->link_primary_inode(straydn, in); @@ -11259,11 +11295,12 @@ void MDCache::handle_dentry_unlink(const cref_t &m) in->first = straydn->first; // update subtree map? - if (in->is_dir()) + if (in->is_dir()) { adjust_subtree_after_rename(in, dir, false); + } if (m->snapbl.length()) { - bool hadrealm = (in->snaprealm ? true : false); + hadrealm = (in->snaprealm ? true : false); in->decode_snap_blob(m->snapbl); ceph_assert(in->snaprealm); if (!hadrealm) @@ -11274,7 +11311,7 @@ void MDCache::handle_dentry_unlink(const cref_t &m) if (in->is_any_caps() && !in->state_test(CInode::STATE_EXPORTINGCAPS)) migrator->export_caps(in); - + straydn = NULL; } else { ceph_assert(!straydn); @@ -11282,6 +11319,7 @@ void MDCache::handle_dentry_unlink(const cref_t &m) dn->dir->unlink_inode(dn); } ceph_assert(dnl->is_null()); + dn->state_clear(CDentry::STATE_UNLINKING); } } @@ -11293,8 +11331,36 @@ void MDCache::handle_dentry_unlink(const cref_t &m) trim_dentry(straydn, ex); send_expire_messages(ex); } + return; + +ack: + ack = make_message(m->get_dirfrag(), m->get_dn()); + mds->send_message(ack, m->get_connection()); } +void MDCache::handle_dentry_unlink_ack(const cref_t &m) +{ + CDir *dir = get_dirfrag(m->get_dirfrag()); + if (!dir) { + dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl; + } else { + CDentry *dn = dir->lookup(m->get_dn()); + if (!dn) { + dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl; + } else { + dout(7) << __func__ << " on " << *dn << " ref " + << dn->replica_unlinking_ref << " -> " + << dn->replica_unlinking_ref - 1 << dendl; + dn->replica_unlinking_ref--; + if (!dn->replica_unlinking_ref) { + MDSContext::vec finished; + dn->take_waiting(CDentry::WAIT_UNLINK_STATE, finished); + mds->queue_waiters(finished); + } + dn->put(CDentry::PIN_WAITUNLINKSTATE); + } + } +} diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index bd109e2549075..7b544998b3c12 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -891,7 +891,7 @@ class MDCache { void encode_remote_dentry_link(CDentry::linkage_t *dnl, bufferlist& bl); void decode_remote_dentry_link(CDir *dir, CDentry *dn, bufferlist::const_iterator& p); void send_dentry_link(CDentry *dn, MDRequestRef& mdr); - void send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr); + void send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr, bool unlinking=false); void wait_for_uncommitted_fragment(dirfrag_t dirfrag, MDSContext *c) { uncommitted_fragments.at(dirfrag).waiters.push_back(c); @@ -1134,6 +1134,7 @@ class MDCache { void handle_discover_reply(const cref_t &m); void handle_dentry_link(const cref_t &m); void handle_dentry_unlink(const cref_t &m); + void handle_dentry_unlink_ack(const cref_t &m); int dump_cache(std::string_view fn, Formatter *f, double timeout); diff --git a/src/mds/Server.cc b/src/mds/Server.cc index e8313712cdcb7..36b2d7502dcd6 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -1940,20 +1940,23 @@ void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEv mdr->pin(dn); early_reply(mdr, in, dn); - + mdr->committing = true; submit_mdlog_entry(le, fin, mdr, __func__); - + if (mdr->client_request && mdr->client_request->is_queued_for_replay()) { if (mds->queue_one_replay()) { dout(10) << " queued next replay op" << dendl; } else { dout(10) << " journaled last replay op" << dendl; } - } else if (mdr->did_early_reply) + } else if (mdr->did_early_reply) { mds->locker->drop_rdlocks_for_early_reply(mdr.get()); - else + if (dn && dn->is_waiter_for(CDentry::WAIT_UNLINK_FINISH)) + mdlog->flush(); + } else { mdlog->flush(); + } } void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr, @@ -4451,6 +4454,11 @@ void Server::handle_client_openc(MDRequestRef& mdr) if (!dn) return; + if (is_unlink_pending(dn)) { + wait_for_pending_unlink(dn, mdr); + return; + } + CDentry::linkage_t *dnl = dn->get_projected_linkage(); if (!excl && !dnl->is_null()) { // it existed. @@ -6692,6 +6700,44 @@ void Server::handle_client_getvxattr(MDRequestRef& mdr) // ------------------------------------------------ +struct C_WaitUnlinkToFinish : public MDSContext { +protected: + MDCache *mdcache; + CDentry *dn; + MDSContext *fin; + + MDSRank *get_mds() override + { + ceph_assert(mdcache != NULL); + return mdcache->mds; + } + +public: + C_WaitUnlinkToFinish(MDCache *m, CDentry *d, MDSContext *f) : + mdcache(m), dn(d), fin(f) {} + void finish(int r) override { + fin->complete(r); + dn->put(CDentry::PIN_PURGING); + } +}; + +bool Server::is_unlink_pending(CDentry *dn) +{ + CDentry::linkage_t *dnl = dn->get_projected_linkage(); + if (!dnl->is_null() && dn->state_test(CDentry::STATE_UNLINKING)) { + return true; + } + return false; +} + +void Server::wait_for_pending_unlink(CDentry *dn, MDRequestRef& mdr) +{ + mds->locker->drop_locks(mdr.get()); + auto fin = new C_MDS_RetryRequest(mdcache, mdr); + dn->get(CDentry::PIN_PURGING); + dn->add_waiter(CDentry::WAIT_UNLINK_FINISH, new C_WaitUnlinkToFinish(mdcache, dn, fin)); +} + // MKNOD class C_MDS_mknod_finish : public ServerLogContext { @@ -6755,6 +6801,11 @@ void Server::handle_client_mknod(MDRequestRef& mdr) if (!dn) return; + if (is_unlink_pending(dn)) { + wait_for_pending_unlink(dn, mdr); + return; + } + CDir *dir = dn->get_dir(); CInode *diri = dir->get_inode(); if (!check_access(mdr, diri, MAY_WRITE)) @@ -6853,6 +6904,11 @@ void Server::handle_client_mkdir(MDRequestRef& mdr) if (!dn) return; + if (is_unlink_pending(dn)) { + wait_for_pending_unlink(dn, mdr); + return; + } + CDir *dir = dn->get_dir(); CInode *diri = dir->get_inode(); @@ -6948,6 +7004,11 @@ void Server::handle_client_symlink(MDRequestRef& mdr) if (!dn) return; + if (is_unlink_pending(dn)) { + wait_for_pending_unlink(dn, mdr); + return; + } + CDir *dir = dn->get_dir(); CInode *diri = dir->get_inode(); @@ -7054,6 +7115,11 @@ void Server::handle_client_link(MDRequestRef& mdr) targeti = ret.second->get_projected_linkage()->get_inode(); } + if (is_unlink_pending(destdn)) { + wait_for_pending_unlink(destdn, mdr); + return; + } + ceph_assert(destdn->get_projected_linkage()->is_null()); if (req->get_alternate_name().size() > alternate_name_max) { dout(10) << " alternate_name longer than " << alternate_name_max << dendl; @@ -7338,11 +7404,17 @@ void Server::_link_remote_finish(MDRequestRef& mdr, bool inc, mdr->apply(); MDRequestRef null_ref; - if (inc) + if (inc) { mdcache->send_dentry_link(dn, null_ref); - else + } else { + dn->state_clear(CDentry::STATE_UNLINKING); mdcache->send_dentry_unlink(dn, NULL, null_ref); - + + MDSContext::vec finished; + dn->take_waiting(CDentry::WAIT_UNLINK_FINISH, finished); + mdcache->mds->queue_waiters(finished); + } + // bump target popularity mds->balancer->hit_inode(targeti, META_POP_IWR); mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR); @@ -7716,10 +7788,20 @@ void Server::handle_client_unlink(MDRequestRef& mdr) if (rmdir) mdr->disable_lock_cache(); + CDentry *dn = rdlock_path_xlock_dentry(mdr, false, true); if (!dn) return; + // notify replica MDSes the dentry is under unlink + if (!dn->state_test(CDentry::STATE_UNLINKING)) { + dn->state_set(CDentry::STATE_UNLINKING); + mdcache->send_dentry_unlink(dn, nullptr, mdr, true); + if (dn->replica_unlinking_ref) { + return; + } + } + CDentry::linkage_t *dnl = dn->get_linkage(client, mdr); ceph_assert(!dnl->is_null()); CInode *in = dnl->get_inode(); @@ -7994,9 +8076,14 @@ void Server::_unlink_local_finish(MDRequestRef& mdr, } mdr->apply(); - + + dn->state_clear(CDentry::STATE_UNLINKING); mdcache->send_dentry_unlink(dn, straydn, mdr); - + + MDSContext::vec finished; + dn->take_waiting(CDentry::WAIT_UNLINK_FINISH, finished); + mdcache->mds->queue_waiters(finished); + if (straydn) { // update subtree map? if (strayin->is_dir()) @@ -8011,7 +8098,7 @@ void Server::_unlink_local_finish(MDRequestRef& mdr, // reply respond_to_request(mdr, 0); - + // removing a new dn? dn->get_dir()->try_remove_unlinked_dn(dn); @@ -8470,6 +8557,16 @@ void Server::handle_client_rename(MDRequestRef& mdr) if (!destdn) return; + if (is_unlink_pending(destdn)) { + wait_for_pending_unlink(destdn, mdr); + return; + } + + if (is_unlink_pending(srcdn)) { + wait_for_pending_unlink(srcdn, mdr); + return; + } + dout(10) << " destdn " << *destdn << dendl; CDir *destdir = destdn->get_dir(); ceph_assert(destdir->is_auth()); diff --git a/src/mds/Server.h b/src/mds/Server.h index 160dda9853468..7d69651b9d6a3 100644 --- a/src/mds/Server.h +++ b/src/mds/Server.h @@ -234,6 +234,9 @@ class Server { void handle_client_fsync(MDRequestRef& mdr); + bool is_unlink_pending(CDentry *dn); + void wait_for_pending_unlink(CDentry *dn, MDRequestRef& mdr); + // open void handle_client_open(MDRequestRef& mdr); void handle_client_openc(MDRequestRef& mdr); // O_CREAT variant. diff --git a/src/messages/MDentryUnlink.h b/src/messages/MDentryUnlink.h index 210fa033c5219..fc52284416659 100644 --- a/src/messages/MDentryUnlink.h +++ b/src/messages/MDentryUnlink.h @@ -22,15 +22,17 @@ class MDentryUnlink final : public MMDSOp { private: - static constexpr int HEAD_VERSION = 1; + static constexpr int HEAD_VERSION = 2; static constexpr int COMPAT_VERSION = 1; - + dirfrag_t dirfrag; std::string dn; + bool unlinking = false; public: dirfrag_t get_dirfrag() const { return dirfrag; } const std::string& get_dn() const { return dn; } + bool is_unlinking() const { return unlinking; } ceph::buffer::list straybl; ceph::buffer::list snapbl; @@ -38,10 +40,9 @@ class MDentryUnlink final : public MMDSOp { protected: MDentryUnlink() : MMDSOp(MSG_MDS_DENTRYUNLINK, HEAD_VERSION, COMPAT_VERSION) { } - MDentryUnlink(dirfrag_t df, std::string_view n) : + MDentryUnlink(dirfrag_t df, std::string_view n, bool u=false) : MMDSOp(MSG_MDS_DENTRYUNLINK, HEAD_VERSION, COMPAT_VERSION), - dirfrag(df), - dn(n) {} + dirfrag(df), dn(n), unlinking(u) {} ~MDentryUnlink() final {} public: @@ -49,19 +50,66 @@ class MDentryUnlink final : public MMDSOp { void print(std::ostream& o) const override { o << "dentry_unlink(" << dirfrag << " " << dn << ")"; } - + void decode_payload() override { using ceph::decode; auto p = payload.cbegin(); decode(dirfrag, p); decode(dn, p); decode(straybl, p); + if (header.version >= 2) + decode(unlinking, p); } void encode_payload(uint64_t features) override { using ceph::encode; encode(dirfrag, payload); encode(dn, payload); encode(straybl, payload); + encode(unlinking, payload); + } +private: + template + friend boost::intrusive_ptr ceph::make_message(Args&&... args); + template + friend MURef crimson::make_message(Args&&... args); +}; + +class MDentryUnlinkAck final : public MMDSOp { +private: + static constexpr int HEAD_VERSION = 1; + static constexpr int COMPAT_VERSION = 1; + + dirfrag_t dirfrag; + std::string dn; + + public: + dirfrag_t get_dirfrag() const { return dirfrag; } + const std::string& get_dn() const { return dn; } + +protected: + MDentryUnlinkAck() : + MMDSOp(MSG_MDS_DENTRYUNLINK_ACK, HEAD_VERSION, COMPAT_VERSION) { } + MDentryUnlinkAck(dirfrag_t df, std::string_view n) : + MMDSOp(MSG_MDS_DENTRYUNLINK_ACK, HEAD_VERSION, COMPAT_VERSION), + dirfrag(df), dn(n) {} + ~MDentryUnlinkAck() final {} + +public: + std::string_view get_type_name() const override { return "dentry_unlink_ack";} + void print(std::ostream& o) const override { + o << "dentry_unlink_ack(" << dirfrag << " " << dn << ")"; + } + + void decode_payload() override { + using ceph::decode; + auto p = payload.cbegin(); + decode(dirfrag, p); + decode(dn, p); + } + void encode_payload(uint64_t features) override { + using ceph::encode; + encode(dirfrag, payload); + encode(dn, payload); } private: template diff --git a/src/msg/Message.cc b/src/msg/Message.cc index c6f5a3cdbbb30..b141ba848be3f 100644 --- a/src/msg/Message.cc +++ b/src/msg/Message.cc @@ -814,6 +814,9 @@ Message *decode_message(CephContext *cct, break; + case MSG_MDS_DENTRYUNLINK_ACK: + m = make_message(); + break; case MSG_MDS_DENTRYUNLINK: m = make_message(); break; diff --git a/src/msg/Message.h b/src/msg/Message.h index a7aff1e27a603..49ace3850c23f 100644 --- a/src/msg/Message.h +++ b/src/msg/Message.h @@ -173,6 +173,7 @@ #define MSG_MDS_OPENINOREPLY 0x210 #define MSG_MDS_SNAPUPDATE 0x211 #define MSG_MDS_FRAGMENTNOTIFYACK 0x212 +#define MSG_MDS_DENTRYUNLINK_ACK 0x213 #define MSG_MDS_LOCK 0x300 // 0x3xx are for locker of mds #define MSG_MDS_INODEFILECAPS 0x301