mds: multiple mds scrub support #35749

Merged
merged 19 commits on Nov 17, 2020
Changes from all commits

Commits (19)
afb2e12
mds: remove ScrubStack::scrubstack
ukernel May 3, 2020
547c1ed
mds: make both CInode and CDir as entities of scrub
ukernel May 2, 2020
b43af15
mds: change scrub traverse from post-order to breadth-first search
simon-rock May 15, 2020
a4c9bfc
mds: prevent dirfrag scrub/fragment from running at the same time
ukernel May 20, 2020
38c6fc8
mds: remove object can't be scrubbed immediately from scrub stack
ukernel May 22, 2020
458e7d6
include/frag: add encode/decode functions for fragset_t
ukernel May 23, 2020
e5c4e8c
mds: multiple mds scrub support
simon-rock May 28, 2020
c8c3ba4
mds: rdlock file/nest lock when accumulating stats of subtree dirfrags
simon-rock Jun 3, 2020
f01ebad
mds: auth pin CInode when validating its disk state
ukernel Jun 3, 2020
86fb5b4
Continuation: don't delete self while there are in-processing stages
ukernel Jun 10, 2020
03908aa
mds: remove on_finish from {CInode,CDir}::scrub_info_t
ukernel Jun 19, 2020
ad5471f
mds: track scrub status in multiple mds
ukernel Jun 19, 2020
ff10bdb
mds: abort/pause/resume scrubs in multiple mds
ukernel Jun 23, 2020
fbac6bc
mds: don't skip validating disk state of symlink
ukernel Jun 24, 2020
31deca1
qa/cephfs: update existing scrub test cases
ukernel Jun 29, 2020
904c959
qa/cephfs: add tests for multimds scrub
ukernel Jul 1, 2020
ba9b85c
qa/cephfs: Add more tests for multimds scrub
sidharthanup Aug 25, 2020
c42570a
qa/cephfs: log-ignorelist scrub errors
ukernel Nov 16, 2020
11a1997
Update Release notes for multimds scrub
ukernel Nov 17, 2020
3 changes: 3 additions & 0 deletions PendingReleaseNotes
@@ -183,6 +183,9 @@
ceph daemon <mds of rank 0> scrub_path / force recursive repair
ceph daemon <mds of rank 0> scrub_path '~mdsdir' force recursive repair

* CephFS: Scrub is now supported with multiple active MDS daemons. MDS rank 0
  handles scrub commands and forwards scrub operations to the other active
  ranks as necessary.

* The following librados API calls have changed:

- ``rados_blacklist_add`` is now ``rados_blocklist_add``; the former will issue a deprecation warning and be removed in a future release.
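
A minimal sketch of the flow described in the release note above, written in the
style of the tests this PR adds. It assumes the CephFS QA helpers those tests use
(CephFSTestCase, fs.rank_tell, fs.get_rank, fs.mon_manager); the class and test
names are illustrative and not part of the patch.

import errno
from tasks.cephfs.cephfs_test_case import CephFSTestCase
from teuthology.orchestra.run import CommandFailedError

class ScrubViaRank0Sketch(CephFSTestCase):
    MDSS_REQUIRED = 2

    def test_scrub_is_driven_by_rank0(self):
        self.fs.set_max_mds(2)
        self.fs.wait_for_daemons()

        # Rank 0 accepts the scrub command and forwards work to the ranks
        # that own the affected subtrees.
        out_json = self.fs.rank_tell(["scrub", "start", "/", "recursive"], 0)
        self.assertNotEqual(out_json, None)

        # The same command sent to any other rank is rejected with EXDEV,
        # as exercised by test_scrub_non_mds0 further down in this diff.
        rank1 = self.fs.get_rank(rank=1)
        try:
            self.fs.mon_manager.raw_cluster_cmd(
                'tell', 'mds.{0}'.format(rank1["name"]), "scrub", "start", "/")
        except CommandFailedError as e:
            self.assertEqual(e.exitstatus, errno.EXDEV)
        else:
            raise RuntimeError("expected EXDEV from a non-zero rank")

The same entry-point behaviour is tested for real by qa/tasks/cephfs/test_multimds_misc.py
later in this pull request; the sketch above only condenses that flow.
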
9 changes: 9 additions & 0 deletions qa/suites/multimds/basic/tasks/cephfs_test_multimds_misc.yaml
@@ -0,0 +1,9 @@
overrides:
  ceph:
    log-ignorelist:
      - Scrub error on inode

tasks:
  - cephfs_test_runner:
      modules:
        - tasks.cephfs.test_multimds_misc
228 changes: 228 additions & 0 deletions qa/tasks/cephfs/test_multimds_misc.py
@@ -0,0 +1,228 @@
import logging
import errno
from tasks.cephfs.cephfs_test_case import CephFSTestCase
from teuthology.contextutil import safe_while
from teuthology.orchestra.run import CommandFailedError

log = logging.getLogger(__name__)

class TestScrub2(CephFSTestCase):
    MDSS_REQUIRED = 3
    CLIENTS_REQUIRED = 1

    def _get_scrub_status(self, rank=0):
        return self.fs.rank_tell(["scrub", "status"], rank)

    def _wait_until_scrubbed(self, timeout):
        self.wait_until_true(lambda: "no active" in self._get_scrub_status()['status'], timeout)

    def _check_task_status_na(self, timo=120):
        """ check absence of scrub status in ceph status """
        with safe_while(sleep=1, tries=120, action='wait for task status') as proceed:
            while proceed():
                active = self.fs.get_active_names()
                log.debug("current active={0}".format(active))
                task_status = self.fs.get_task_status("scrub status")
                if not active[0] in task_status:
                    return True

    def _check_task_status(self, expected_status, timo=120):
        """ check scrub status for current active mds in ceph status """
        with safe_while(sleep=1, tries=120, action='wait for task status') as proceed:
            while proceed():
                active = self.fs.get_active_names()
                log.debug("current active={0}".format(active))
                task_status = self.fs.get_task_status("scrub status")
                try:
                    if task_status[active[0]].startswith(expected_status):
                        return True
                except KeyError:
                    pass

    def _find_path_inos(self, root_path):
        inos = []
        p = self.mount_a.run_shell(["find", root_path])
        paths = p.stdout.getvalue().strip().split()
        for path in paths:
            inos.append(self.mount_a.path_to_ino(path))
        return inos

    def _setup_subtrees(self):
        self.fs.set_max_mds(3)
        self.fs.wait_for_daemons()
        status = self.fs.status()

        path = 'd1/d2/d3/d4/d5/d6/d7/d8'
        self.mount_a.run_shell(['mkdir', '-p', path])
        self.mount_a.run_shell(['sync', path])

        self.mount_a.setfattr("d1/d2", "ceph.dir.pin", "0")
        self.mount_a.setfattr("d1/d2/d3/d4", "ceph.dir.pin", "1")
        self.mount_a.setfattr("d1/d2/d3/d4/d5/d6", "ceph.dir.pin", "2")

        self._wait_subtrees([('/d1/d2', 0), ('/d1/d2/d3/d4', 1)], status, 0)
        self._wait_subtrees([('/d1/d2/d3/d4', 1), ('/d1/d2/d3/d4/d5/d6', 2)], status, 1)
        self._wait_subtrees([('/d1/d2/d3/d4', 1), ('/d1/d2/d3/d4/d5/d6', 2)], status, 2)

        for rank in range(3):
            self.fs.rank_tell(["flush", "journal"], rank)

    def test_apply_tag(self):
        self._setup_subtrees()
        inos = self._find_path_inos('d1/d2/d3/')

        tag = "tag123"
        self.fs.rank_tell(["tag", "path", "/d1/d2/d3", tag], 0)
        self._wait_until_scrubbed(30)

        def assertTagged(ino):
            file_obj_name = "{0:x}.00000000".format(ino)
            self.fs.rados(["getxattr", file_obj_name, "scrub_tag"])

        for ino in inos:
            assertTagged(ino)

    def test_scrub_backtrace(self):
        self._setup_subtrees()
        inos = self._find_path_inos('d1/d2/d3/')

        for ino in inos:
            file_obj_name = "{0:x}.00000000".format(ino)
            self.fs.rados(["rmxattr", file_obj_name, "parent"])

        self.fs.rank_tell(["scrub", "start", "/d1/d2/d3", "recursive", "force"], 0)
        self._wait_until_scrubbed(30)

        def _check_damage(mds_rank, inos):
            all_damage = self.fs.rank_tell(["damage", "ls"], mds_rank)
            damage = [d for d in all_damage if d['ino'] in inos and d['damage_type'] == "backtrace"]
            return len(damage) >= len(inos)

        self.assertTrue(_check_damage(0, inos[0:2]))
        self.assertTrue(_check_damage(1, inos[2:4]))
        self.assertTrue(_check_damage(2, inos[4:6]))

    def test_scrub_non_mds0(self):
        self._setup_subtrees()

        def expect_exdev(cmd, mds):
            try:
                self.fs.mon_manager.raw_cluster_cmd('tell', 'mds.{0}'.format(mds), *cmd)
            except CommandFailedError as e:
                if e.exitstatus == errno.EXDEV:
                    pass
                else:
                    raise
            else:
                raise RuntimeError("expected failure")

        rank1 = self.fs.get_rank(rank=1)
        expect_exdev(["scrub", "start", "/d1/d2/d3"], rank1["name"])
        expect_exdev(["scrub", "abort"], rank1["name"])
        expect_exdev(["scrub", "pause"], rank1["name"])
        expect_exdev(["scrub", "resume"], rank1["name"])

    def test_scrub_abort_mds0(self):
        self._setup_subtrees()

        inos = self._find_path_inos('d1/d2/d3/')

        for ino in inos:
            file_obj_name = "{0:x}.00000000".format(ino)
            self.fs.rados(["rmxattr", file_obj_name, "parent"])

        out_json = self.fs.rank_tell(["scrub", "start", "/d1/d2/d3", "recursive", "force"], 0)
        self.assertNotEqual(out_json, None)

        res = self.fs.rank_tell(["scrub", "abort"])
        self.assertEqual(res['return_code'], 0)

        # Abort and verify in both mdss. We also check the status in rank 0 mds because
        # it is supposed to gather the scrub status from other mdss.
        self.wait_until_true(lambda: "no active" in self._get_scrub_status(1)['status']
                             and "no active" in self._get_scrub_status(2)['status']
                             and "no active" in self._get_scrub_status(0)['status'], 30)

        # sleep enough to fetch updated task status
        checked = self._check_task_status_na()
        self.assertTrue(checked)

    def test_scrub_pause_and_resume_mds0(self):
        self._setup_subtrees()

        inos = self._find_path_inos('d1/d2/d3/')

        for ino in inos:
            file_obj_name = "{0:x}.00000000".format(ino)
            self.fs.rados(["rmxattr", file_obj_name, "parent"])

        out_json = self.fs.rank_tell(["scrub", "start", "/d1/d2/d3", "recursive", "force"], 0)
        self.assertNotEqual(out_json, None)

        res = self.fs.rank_tell(["scrub", "pause"])
        self.assertEqual(res['return_code'], 0)

        self.wait_until_true(lambda: "PAUSED" in self._get_scrub_status(1)['status']
                             and "PAUSED" in self._get_scrub_status(2)['status']
                             and "PAUSED" in self._get_scrub_status(0)['status'], 30)

        checked = self._check_task_status("paused")
        self.assertTrue(checked)

        # resume and verify
        res = self.fs.rank_tell(["scrub", "resume"])
        self.assertEqual(res['return_code'], 0)

        self.wait_until_true(lambda: not("PAUSED" in self._get_scrub_status(1)['status'])
                             and not("PAUSED" in self._get_scrub_status(2)['status'])
                             and not("PAUSED" in self._get_scrub_status(0)['status']), 30)

        checked = self._check_task_status_na()
        self.assertTrue(checked)

    def test_scrub_pause_and_resume_with_abort_mds0(self):
        self._setup_subtrees()

        inos = self._find_path_inos('d1/d2/d3/')

        for ino in inos:
            file_obj_name = "{0:x}.00000000".format(ino)
            self.fs.rados(["rmxattr", file_obj_name, "parent"])

        out_json = self.fs.rank_tell(["scrub", "start", "/d1/d2/d3", "recursive", "force"], 0)
        self.assertNotEqual(out_json, None)

        res = self.fs.rank_tell(["scrub", "pause"])
        self.assertEqual(res['return_code'], 0)

        self.wait_until_true(lambda: "PAUSED" in self._get_scrub_status(1)['status']
                             and "PAUSED" in self._get_scrub_status(2)['status']
                             and "PAUSED" in self._get_scrub_status(0)['status'], 30)

        checked = self._check_task_status("paused")
        self.assertTrue(checked)

        res = self.fs.rank_tell(["scrub", "abort"])
        self.assertEqual(res['return_code'], 0)

        self.wait_until_true(lambda: "PAUSED" in self._get_scrub_status(1)['status']
                             and "0 inodes" in self._get_scrub_status(1)['status']
                             and "PAUSED" in self._get_scrub_status(2)['status']
                             and "0 inodes" in self._get_scrub_status(2)['status']
                             and "PAUSED" in self._get_scrub_status(0)['status']
                             and "0 inodes" in self._get_scrub_status(0)['status'], 30)

        # scrub status should still be paused...
        checked = self._check_task_status("paused")
        self.assertTrue(checked)

        # resume and verify
        res = self.fs.rank_tell(["scrub", "resume"])
        self.assertEqual(res['return_code'], 0)

        self.wait_until_true(lambda: not("PAUSED" in self._get_scrub_status(1)['status'])
                             and not("PAUSED" in self._get_scrub_status(2)['status'])
                             and not("PAUSED" in self._get_scrub_status(0)['status']), 30)

        checked = self._check_task_status_na()
        self.assertTrue(checked)
35 changes: 18 additions & 17 deletions qa/tasks/cephfs/test_scrub_checks.py
@@ -77,8 +77,7 @@ def test_scrub_abort(self):

        # abort and verify
        self._abort_scrub(0)
        out_json = self._get_scrub_status()
        self.assertTrue("no active" in out_json['status'])
        self.wait_until_true(lambda: "no active" in self._get_scrub_status()['status'], 30)

        # sleep enough to fetch updated task status
        checked = self._check_task_status_na()
@@ -255,13 +254,6 @@ def _checks(self, run_seq):
command = "scrub start {filepath}".format(filepath=filepath)
self.tell_command(mds_rank, command, success_validator)

filepath = "{repo_path}/suites/fs/basic/clusters/fixed-3-cephfs.yaml". \
format(repo_path=test_repo_path)
command = "scrub start {filepath}".format(filepath=filepath)
self.tell_command(mds_rank, command,
lambda j, r: self.json_validator(j, r, "performed_validation",
False))

if run_seq == 0:
log.info("First run: flushing base dir /")
command = "flush_path /"
@@ -290,15 +282,24 @@ def _checks(self, run_seq):
rados_obj_name = "{ino:x}.00000000".format(ino=ino)
command = "scrub start {file}".format(file=test_new_file)

# Missing parent xattr -> ENODATA
self.fs.rados(["rmxattr", rados_obj_name, "parent"], pool=self.fs.get_data_pool_name())
self.tell_command(mds_rank, command,
lambda j, r: self.json_validator(j, r, "return_code", -errno.ENODATA))
def _get_scrub_status():
return self.fs.rank_tell(["scrub", "status"], mds_rank)

# Missing object -> ENOENT
self.fs.rados(["rm", rados_obj_name], pool=self.fs.get_data_pool_name())
self.tell_command(mds_rank, command,
lambda j, r: self.json_validator(j, r, "return_code", -errno.ENOENT))
def _check_and_clear_damage(ino, dtype):
all_damage = self.fs.rank_tell(["damage", "ls"], mds_rank)
damage = [d for d in all_damage if d['ino'] == ino and d['damage_type'] == dtype]
for d in damage:
self.fs.mon_manager.raw_cluster_cmd(
'tell', 'mds.{0}'.format(self.fs.get_active_names()[mds_rank]),
"damage", "rm", str(d['id']))
return len(damage) > 0

# Missing parent xattr
self.assertFalse(_check_and_clear_damage(ino, "backtrace"));
self.fs.rados(["rmxattr", rados_obj_name, "parent"], pool=self.fs.get_data_pool_name())
self.tell_command(mds_rank, command, success_validator)
self.wait_until_true(lambda: "no active" in _get_scrub_status()['status'], 30)
self.assertTrue(_check_and_clear_damage(ino, "backtrace"));

command = "flush_path /"
self.asok_command(mds_rank, command, success_validator)
3 changes: 1 addition & 2 deletions src/common/Continuation.h
@@ -144,8 +144,7 @@ class Continuation {
assert (!done ||
stages_in_flight.size() == stages_processing.size());

if (done ||
(reported_done && stages_processing.empty())) {
if ((done || reported_done) && stages_processing.empty()) {
_done();
delete this;
}
2 changes: 2 additions & 0 deletions src/common/strtol.h
@@ -134,8 +134,10 @@ auto consume(std::string_view& sv, int base = 10)

bool strict_strtob(const char* str, std::string *err);

long long strict_strtoll(std::string_view str, int base, std::string *err);
long long strict_strtoll(const char *str, int base, std::string *err);

int strict_strtol(std::string_view str, int base, std::string *err);
int strict_strtol(const char *str, int base, std::string *err);

double strict_strtod(const char *str, std::string *err);
1 change: 1 addition & 0 deletions src/include/ceph_fs.h
@@ -415,6 +415,7 @@ enum {
CEPH_MDS_OP_ENQUEUE_SCRUB = 0x01503,
CEPH_MDS_OP_REPAIR_FRAGSTATS = 0x01504,
CEPH_MDS_OP_REPAIR_INODESTATS = 0x01505,
CEPH_MDS_OP_RDLOCK_FRAGSSTATS = 0x01507
};

extern const char *ceph_mds_op_name(int op);
20 changes: 16 additions & 4 deletions src/include/frag.h
@@ -169,6 +169,7 @@ class frag_t {
private:
_frag_t _enc = 0;
};
WRITE_CLASS_ENCODER(frag_t)

inline std::ostream& operator<<(std::ostream& out, const frag_t& hb)
{
@@ -182,8 +183,6 @@ inline std::ostream& operator<<(std::ostream& out, const frag_t& hb)
return out << '*';
}

inline void encode(const frag_t &f, ceph::buffer::list& bl) { f.encode(bl); }
inline void decode(frag_t &f, ceph::buffer::list::const_iterator& p) { f.decode(p); }

using frag_vec_t = boost::container::small_vector<frag_t, 4>;

@@ -558,8 +557,8 @@ class fragset_t {

public:
const std::set<frag_t> &get() const { return _set; }
std::set<frag_t>::iterator begin() { return _set.begin(); }
std::set<frag_t>::iterator end() { return _set.end(); }
std::set<frag_t>::const_iterator begin() const { return _set.begin(); }
std::set<frag_t>::const_iterator end() const { return _set.end(); }

bool empty() const { return _set.empty(); }

@@ -571,6 +570,10 @@ class fragset_t {
}
}

void clear() {
_set.clear();
}

void insert_raw(frag_t f){
_set.insert(f);
}
@@ -593,7 +596,16 @@ class fragset_t {
}
}
}

void encode(ceph::buffer::list& bl) const {
ceph::encode(_set, bl);
}
void decode(ceph::buffer::list::const_iterator& p) {
ceph::decode(_set, p);
}
};
WRITE_CLASS_ENCODER(fragset_t)


inline std::ostream& operator<<(std::ostream& out, const fragset_t& fs)
{