Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pacific: mds: do not take the ino which has been used #51508

Merged
merged 3 commits into from Aug 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
19 changes: 19 additions & 0 deletions doc/cephfs/mds-config-ref.rst
Expand Up @@ -501,6 +501,25 @@
:Type: 32-bit Integer
:Default: ``0``

``mds_inject_skip_replaying_inotable``

:Description: Ceph will skip replaying the inotable when replaying the journal,
and the primary MDS will crash, while the replacing MDS won't.
(for developers only).

:Type: Boolean
:Default: ``false``


``mds_kill_skip_replaying_inotable``

:Description: Ceph will skip replaying the inotable when replaying the journal,
and the primary MDS will crash, while the replacing MDS won't.
(for developers only).

:Type: Boolean
:Default: ``false``


``mds_wipe_sessions``

Expand Down
1 change: 1 addition & 0 deletions qa/suites/fs/multiclient/tasks/cephfs_misc_tests.yaml
Expand Up @@ -11,3 +11,4 @@ overrides:
- has not responded to cap revoke by MDS for over
- MDS_CLIENT_LATE_RELEASE
- responding to mclientcaps
- RECENT_CRASH
43 changes: 43 additions & 0 deletions qa/tasks/cephfs/test_misc.py
Expand Up @@ -414,3 +414,46 @@ def test_drop_cache_command_dead(self):
# particular operation causing this is journal flush which causes the
# MDS to wait wait for cap revoke.
self.mount_a.resume_netns()

class TestSkipReplayInoTable(CephFSTestCase):
    """
    Regression test for an MDS that skips replaying the inotable when
    replaying the journal.

    See: https://tracker.ceph.com/issues/52280
    """
    MDSS_REQUIRED = 1
    CLIENTS_REQUIRED = 1

    def test_alloc_cinode_assert(self):
        """
        Test alloc CInode assert: after the replacing MDS skips replaying
        the inotable, allocating a new inode must not reuse an ino that
        was already journalled as taken.

        See: https://tracker.ceph.com/issues/52280
        """
        # Create a directory; the mds will journal this and then crash.
        self.mount_a.run_shell(["rm", "-rf", "test_alloc_ino"])
        self.mount_a.run_shell(["mkdir", "test_alloc_ino"])

        status = self.fs.status()
        rank0 = self.fs.get_rank(rank=0, status=status)

        self.fs.mds_asok(['config', 'set', 'mds_kill_skip_replaying_inotable', "true"])
        # This will make the MDS crash; since we only have one MDS in the
        # cluster, without "wait=False" this call would get stuck here
        # forever.
        self.mount_a.run_shell(["mkdir", "test_alloc_ino/dir1"], wait=False)

        # Sleep 10 seconds to make sure the journal logs are flushed and
        # the mds crashes.
        # NOTE(review): a fixed sleep may be flaky on slow machines --
        # consider polling for the daemon failure instead.
        time.sleep(10)

        # Now set the mds config to skip replaying the inotable.
        self.fs.set_ceph_conf('mds', 'mds_inject_skip_replaying_inotable', True)
        self.fs.set_ceph_conf('mds', 'mds_wipe_sessions', True)

        self.fs.mds_restart()
        # Sleep 5 seconds to make sure the mds tell command won't get stuck.
        time.sleep(5)
        self.fs.wait_for_daemons()

        # The crash above is deliberate: consume the coredump so the run
        # isn't reported as failed.
        self.delete_mds_coredump(rank0['name'])

        # Creating another directory must succeed and must get a fresh ino.
        self.mount_a.run_shell(["mkdir", "test_alloc_ino/dir2"])

        ls_out = set(self.mount_a.ls("test_alloc_ino/"))
        self.assertEqual(ls_out, {"dir1", "dir2"})
2 changes: 2 additions & 0 deletions src/common/legacy_config_opts.h
Expand Up @@ -130,6 +130,8 @@ OPTION(ms_connection_idle_timeout, OPT_U64)
OPTION(ms_pq_max_tokens_per_priority, OPT_U64)
OPTION(ms_pq_min_cost, OPT_U64)
OPTION(ms_inject_socket_failures, OPT_U64)
OPTION(mds_inject_skip_replaying_inotable, OPT_BOOL)
OPTION(mds_kill_skip_replaying_inotable, OPT_BOOL)
SAFE_OPTION(ms_inject_delay_type, OPT_STR) // "osd mds mon client" allowed
OPTION(ms_inject_delay_max, OPT_DOUBLE) // seconds
OPTION(ms_inject_delay_probability, OPT_DOUBLE) // range [0, 1]
Expand Down
8 changes: 8 additions & 0 deletions src/common/options.cc
Expand Up @@ -8775,6 +8775,14 @@ std::vector<Option> get_mds_options() {
.set_default(false)
.set_description(""),

Option("mds_kill_skip_replaying_inotable", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(false)
.set_description("Ceph will skip replaying the inotable when replaying the journal, and the premary MDS will crash, while the replacing MDS won't. (for testing only)"),

Option("mds_inject_skip_replaying_inotable", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(false)
.set_description("Ceph will skip replaying the inotable when replaying the journal, and the premary MDS will crash, while the replacing MDS won't. (for testing only)"),

Option("mds_inject_traceless_reply_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(0)
.set_description(""),
Expand Down
1 change: 1 addition & 0 deletions src/mds/CDir.cc
Expand Up @@ -1909,6 +1909,7 @@ CDentry *CDir::_load_dentry(

if (!undef_inode) {
mdcache->add_inode(in); // add
mdcache->insert_taken_inos(in->ino());
dn = add_primary_dentry(dname, in, std::move(alternate_name), first, last); // link
}
dout(12) << "_fetched got " << *dn << " " << *in << dendl;
Expand Down
2 changes: 2 additions & 0 deletions src/mds/MDCache.cc
Expand Up @@ -320,6 +320,8 @@ void MDCache::remove_inode(CInode *o)
snap_inode_map.erase(o->vino());
}

clear_taken_inos(o->ino());

if (o->ino() < MDS_INO_SYSTEM_BASE) {
if (o == root) root = 0;
if (o == myin) myin = 0;
Expand Down
15 changes: 15 additions & 0 deletions src/mds/MDCache.h
Expand Up @@ -193,6 +193,19 @@ class MDCache {
explicit MDCache(MDSRank *m, PurgeQueue &purge_queue_);
~MDCache();

// Record an ino that was observed as already taken while replaying the
// journal, so the allocator can refuse to hand it out again.
void insert_taken_inos(inodeno_t ino) {
  replay_taken_inos.insert(ino);
}
// Forget a previously recorded replay-taken ino (no-op if not present).
void clear_taken_inos(inodeno_t ino) {
  replay_taken_inos.erase(ino);
}
// Atomically (w.r.t. this set) check whether 'ino' was recorded as taken
// during replay and remove it. Returns true iff it was present.
bool test_and_clear_taken_inos(inodeno_t ino) {
  return replay_taken_inos.erase(ino) != 0;
}
// True when no replay-taken inos remain recorded.
bool is_taken_inos_empty(void) {
  return replay_taken_inos.empty();
}

// Return the configured cache memory limit.
uint64_t cache_limit_memory(void) {
  return cache_memory_limit;
}
Expand Down Expand Up @@ -1216,6 +1229,8 @@ class MDCache {
StrayManager stray_manager;

private:
std::set<inodeno_t> replay_taken_inos; // the inos have been taken when replaying

// -- fragmenting --
struct ufragment {
ufragment() {}
Expand Down
52 changes: 41 additions & 11 deletions src/mds/Server.cc
Expand Up @@ -3316,17 +3316,36 @@ CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino
// while session is opening.
bool allow_prealloc_inos = mdr->session->is_open();

inodeno_t _useino = useino;

// assign ino
if (allow_prealloc_inos && (mdr->used_prealloc_ino = _inode->ino = mdr->session->take_ino(useino))) {
mds->sessionmap.mark_projected(mdr->session);
dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
<< " (" << mdr->session->info.prealloc_inos.size() << " left)"
<< dendl;
} else {
mdr->alloc_ino =
_inode->ino = mds->inotable->project_alloc_id(useino);
dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
}
do {
if (allow_prealloc_inos && (mdr->used_prealloc_ino = _inode->ino = mdr->session->take_ino(_useino))) {
if (mdcache->test_and_clear_taken_inos(_inode->ino)) {
_inode->ino = 0;
dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
<< " (" << mdr->session->info.prealloc_inos.size() << " left)"
<< " but has been taken, will try again!" << dendl;
} else {
mds->sessionmap.mark_projected(mdr->session);
dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
<< " (" << mdr->session->info.prealloc_inos.size() << " left)"
<< dendl;
}
} else {
mdr->alloc_ino =
_inode->ino = mds->inotable->project_alloc_id(_useino);
if (mdcache->test_and_clear_taken_inos(_inode->ino)) {
mds->inotable->apply_alloc_id(_inode->ino);
_inode->ino = 0;
dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino
<< " but has been taken, will try again!" << dendl;
} else {
dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
}
}
_useino = 0;
} while (!_inode->ino);

if (useino && useino != _inode->ino) {
dout(0) << "WARNING: client specified " << useino << " and i allocated " << _inode->ino << dendl;
Expand All @@ -3335,7 +3354,7 @@ CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino
<< " but mds." << mds->get_nodeid() << " allocated " << _inode->ino;
//ceph_abort(); // just for now.
}

if (allow_prealloc_inos &&
mdr->session->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos / 2) {
int need = g_conf()->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
Expand Down Expand Up @@ -4416,6 +4435,9 @@ class C_MDS_openc_finish : public ServerLogContext {
void finish(int r) override {
ceph_assert(r == 0);

// crash current MDS and the replacing MDS will test the journal
ceph_assert(!g_conf()->mds_kill_skip_replaying_inotable);

dn->pop_projected_linkage();

// dirty inode, dn, dir
Expand Down Expand Up @@ -6688,6 +6710,9 @@ class C_MDS_mknod_finish : public ServerLogContext {
void finish(int r) override {
ceph_assert(r == 0);

// crash current MDS and the replacing MDS will test the journal
ceph_assert(!g_conf()->mds_kill_skip_replaying_inotable);

// link the inode
dn->pop_projected_linkage();

Expand Down Expand Up @@ -6994,6 +7019,11 @@ void Server::handle_client_symlink(MDRequestRef& mdr)

journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
mds->balancer->maybe_fragment(dir, false);

// flush the journal as soon as possible
if (g_conf()->mds_kill_skip_replaying_inotable) {
mdlog->flush();
}
}


Expand Down
2 changes: 1 addition & 1 deletion src/mds/events/EMetaBlob.h
Expand Up @@ -600,7 +600,7 @@ class EMetaBlob {
}

void update_segment(LogSegment *ls);
void replay(MDSRank *mds, LogSegment *ls, MDPeerUpdate *su=NULL);
void replay(MDSRank *mds, LogSegment *ls, int type, MDPeerUpdate *su=NULL);
};
WRITE_CLASS_ENCODER_FEATURES(EMetaBlob)
WRITE_CLASS_ENCODER_FEATURES(EMetaBlob::fullbit)
Expand Down
31 changes: 20 additions & 11 deletions src/mds/journal.cc
Expand Up @@ -1156,7 +1156,7 @@ void EMetaBlob::generate_test_instances(std::list<EMetaBlob*>& ls)
ls.push_back(new EMetaBlob());
}

void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDPeerUpdate *peerup)
void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, int type, MDPeerUpdate *peerup)
{
dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps by " << client_name << dendl;

Expand Down Expand Up @@ -1560,11 +1560,16 @@ void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDPeerUpdate *peerup)
logseg->open_files.push_back(&in->item_open_file);
}

bool skip_replaying_inotable = g_conf()->mds_inject_skip_replaying_inotable;

// allocated_inos
if (inotablev) {
if (mds->inotable->get_version() >= inotablev) {
if (mds->inotable->get_version() >= inotablev ||
unlikely(type == EVENT_UPDATE && skip_replaying_inotable)) {
dout(10) << "EMetaBlob.replay inotable tablev " << inotablev
<< " <= table " << mds->inotable->get_version() << dendl;
if (allocated_ino)
mds->mdcache->insert_taken_inos(allocated_ino);
} else {
dout(10) << "EMetaBlob.replay inotable v " << inotablev
<< " - 1 == table " << mds->inotable->get_version()
Expand All @@ -1588,9 +1593,12 @@ void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDPeerUpdate *peerup)
}
}
if (sessionmapv) {
if (mds->sessionmap.get_version() >= sessionmapv) {
if (mds->sessionmap.get_version() >= sessionmapv ||
unlikely(type == EVENT_UPDATE && skip_replaying_inotable)) {
dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
<< " <= table " << mds->sessionmap.get_version() << dendl;
if (used_preallocated_ino)
mds->mdcache->insert_taken_inos(used_preallocated_ino);
} else {
dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
<< ", table " << mds->sessionmap.get_version()
Expand Down Expand Up @@ -2230,7 +2238,8 @@ void EUpdate::update_segment()
void EUpdate::replay(MDSRank *mds)
{
auto&& segment = get_segment();
metablob.replay(mds, segment);
dout(10) << "EUpdate::replay" << dendl;
metablob.replay(mds, segment, EVENT_UPDATE);

if (had_peers) {
dout(10) << "EUpdate.replay " << reqid << " had peers, expecting a matching ECommitted" << dendl;
Expand Down Expand Up @@ -2313,7 +2322,7 @@ void EOpen::replay(MDSRank *mds)
{
dout(10) << "EOpen.replay " << dendl;
auto&& segment = get_segment();
metablob.replay(mds, segment);
metablob.replay(mds, segment, EVENT_OPEN);

// note which segments inodes belong to, so we don't have to start rejournaling them
for (const auto &ino : inos) {
Expand Down Expand Up @@ -2629,7 +2638,7 @@ void EPeerUpdate::replay(MDSRank *mds)
dout(10) << "EPeerUpdate.replay prepare " << reqid << " for mds." << leader
<< ": applying commit, saving rollback info" << dendl;
su = new MDPeerUpdate(origop, rollback);
commit.replay(mds, segment, su);
commit.replay(mds, segment, EVENT_PEERUPDATE, su);
mds->mdcache->add_uncommitted_peer(reqid, segment, leader, su);
break;

Expand All @@ -2641,7 +2650,7 @@ void EPeerUpdate::replay(MDSRank *mds)
case EPeerUpdate::OP_ROLLBACK:
dout(10) << "EPeerUpdate.replay abort " << reqid << " for mds." << leader
<< ": applying rollback commit blob" << dendl;
commit.replay(mds, segment);
commit.replay(mds, segment, EVENT_PEERUPDATE);
mds->mdcache->finish_uncommitted_peer(reqid, false);
break;

Expand Down Expand Up @@ -2820,7 +2829,7 @@ void ESubtreeMap::replay(MDSRank *mds)

// first, stick the spanning tree in my cache
//metablob.print(*_dout);
metablob.replay(mds, get_segment());
metablob.replay(mds, get_segment(), EVENT_SUBTREEMAP);

// restore import/export maps
for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = subtrees.begin();
Expand Down Expand Up @@ -2895,7 +2904,7 @@ void EFragment::replay(MDSRank *mds)
ceph_abort();
}

metablob.replay(mds, segment);
metablob.replay(mds, segment, EVENT_FRAGMENT);
if (in && g_conf()->mds_debug_frag)
in->verify_dirfrags();
}
Expand Down Expand Up @@ -2979,7 +2988,7 @@ void EExport::replay(MDSRank *mds)
{
dout(10) << "EExport.replay " << base << dendl;
auto&& segment = get_segment();
metablob.replay(mds, segment);
metablob.replay(mds, segment, EVENT_EXPORT);

CDir *dir = mds->mdcache->get_dirfrag(base);
ceph_assert(dir);
Expand Down Expand Up @@ -3058,7 +3067,7 @@ void EImportStart::replay(MDSRank *mds)
dout(10) << "EImportStart.replay " << base << " bounds " << bounds << dendl;
//metablob.print(*_dout);
auto&& segment = get_segment();
metablob.replay(mds, segment);
metablob.replay(mds, segment, EVENT_IMPORTSTART);

// put in ambiguous import list
mds->mdcache->add_ambiguous_import(base, bounds);
Expand Down