Skip to content

Commit

Permalink
qa/tasks/cephfs: test damage to dentry's first is caught
Browse files Browse the repository at this point in the history
Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
  • Loading branch information
batrick committed Jan 20, 2023
1 parent ac1197d commit ed581bf
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 0 deletions.
3 changes: 3 additions & 0 deletions qa/tasks/cephfs/filesystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -1645,6 +1645,9 @@ def run_scrub(self, cmd, rank=0):
def get_scrub_status(self, rank=0):
    """
    Run the scrub "status" subcommand on the given rank.

    :param rank: rank to query (defaults to 0)
    :return: whatever run_scrub() returns for the "status" command
    """
    status_cmd = ["status"]
    return self.run_scrub(status_cmd, rank)

def flush(self, rank=0):
    """
    Send the "flush journal" command to the given rank.

    :param rank: rank to address (defaults to 0)
    :return: whatever rank_tell() returns for the command
    """
    flush_cmd = ["flush", "journal"]
    return self.rank_tell(flush_cmd, rank=rank)

def wait_until_scrub_complete(self, result=None, tag=None, rank=0, sleep=30,
timeout=300, reverse=False):
# time out after "timeout" seconds and assume as done
Expand Down
94 changes: 94 additions & 0 deletions qa/tasks/cephfs/test_damage.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import logging
import errno
import re
import time
from teuthology.contextutil import MaxWhileTries
from teuthology.exceptions import CommandFailedError
from teuthology.orchestra.run import wait
Expand Down Expand Up @@ -562,3 +563,96 @@ def test_open_ino_errors(self):
self.fs.mon_manager.raw_cluster_cmd(
'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
"damage", "rm", str(entry['id']))

def test_dentry_first_existing(self):
    """
    That the MDS won't abort when the dentry is already known to be damaged.

    A rename is performed while mds_inject_rename_corrupt_dentry_first is
    set, and the resulting dentry is inspected through the cache dump of
    /a.  The file system is then failed and made joinable again with the
    injection option removed.  A journal flush afterwards must not be
    accompanied by a failover, and the dentry must still report the same
    snap_first value.
    """

    def verify_corrupt():
        # Expect exactly one cache entry for /a, holding one dirfrag with
        # one dentry.
        info = self.fs.read_cache("/a", 0)
        self.assertEqual(len(info), 1)
        dirfrags = info[0]['dirfrags']
        self.assertEqual(len(dirfrags), 1)
        dentries = dirfrags[0]['dentries']
        self.assertEqual(len(dentries), 1)
        # assertEqual needs both operands; passing a single `a == b`
        # expression raises TypeError instead of checking the value.
        # 18446744073709551606 == 2**64 - 10; labelled SNAP_HEAD by the
        # original author -- TODO confirm the constant name.
        self.assertEqual(dentries[0]['snap_first'], 18446744073709551606) # SNAP_HEAD

    self.mount_a.run_shell_payload("mkdir -p a/b")
    self.fs.flush()
    # Keep the MDS from aborting on the damage, then make renames inject it.
    self.config_set("mds", "mds_abort_on_newly_corrupt_dentry", False)
    self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0")
    time.sleep(5) # for conf to percolate
    self.mount_a.run_shell_payload("mv a/b a/c; sync .")
    self.mount_a.umount()
    verify_corrupt()

    # Restart the file system with the injection removed.
    self.fs.fail()
    self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first")
    # Same value as set earlier; re-applied before the MDS comes back.
    self.config_set("mds", "mds_abort_on_newly_corrupt_dentry", False)
    self.fs.set_joinable()

    # Flushing the journal must not coincide with a failover.
    status = self.fs.status()
    self.fs.flush()
    self.assertFalse(self.fs.status().hadfailover(status))
    verify_corrupt()

def test_dentry_first_preflush(self):
    """
    That the MDS won't write a dentry with new damage to CDentry::first
    to the journal.
    """

    def rank_is_laggy():
        # True once the rank info carries a "laggy_since" key.
        return "laggy_since" in self.fs.get_rank()

    # Remember the rank info up front; its 'name' is needed for the
    # coredump cleanup further down.
    rank_before = self.fs.get_rank()
    self.fs.rank_freeze(True, rank=0)

    # Build the directory tree and flush it before enabling the injection.
    self.mount_a.run_shell_payload("mkdir -p a/{b,c}/d")
    self.fs.flush()
    self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0")
    time.sleep(5) # for conf to percolate

    # Start the rename in the background and wait for the rank to go laggy.
    mv_proc = self.mount_a.run_shell_payload("timeout 60 mv a/b a/z", wait=False)
    self.wait_until_true(rank_is_laggy, timeout=self.fs.beacon_timeout)

    # Remove the injection, clean up the coredump and unfreeze the rank.
    self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first")
    self.delete_mds_coredump(rank_before['name'])
    self.fs.rank_freeze(False, rank=0)
    self.fs.wait_for_daemons()
    mv_proc.wait()

    # The tree must still be readable and the journal flushable.
    self.mount_a.run_shell_payload("stat a/ && find a/")
    self.fs.flush()

def test_dentry_first_precommit(self):
    """
    That the MDS won't write a dentry with new damage to CDentry::first
    to the directory object.
    """

    def rank_is_laggy():
        # True once the rank info carries a "laggy_since" key.
        return "laggy_since" in self.fs.get_rank()

    fscid = self.fs.id
    self.mount_a.run_shell_payload("mkdir -p a/{b,c}/d; sync .")
    # unmount so scatter write back can happen immediately
    self.mount_a.umount()
    self.fs.flush()

    # change the mode of a/b/d so there is a fresh metadata update
    self.mount_a.mount_wait()
    self.mount_a.run_shell_payload("chmod 711 a/b/d; sync .")
    # unmount again to keep session related events out of the journal
    self.mount_a.umount()

    # restart with the injection on, so the dentry gets damaged after it
    # is loaded from the journal
    self.fs.fail()
    self.config_set("mds", "mds_inject_journal_corrupt_dentry_first", "1.0")
    time.sleep(5) # for conf to percolate
    self.fs.set_joinable()
    self.fs.wait_for_daemons()
    rank_info = self.fs.get_rank()
    self.fs.rank_freeze(True, rank=0)

    # a journal flush triggers the commit, which is expected to crash the
    # MDS, so issue it in the background
    flush_cmd = ['--connect-timeout=60', 'tell', f"mds.{fscid}:0", "flush", "journal"]
    flush_proc = self.ceph_cluster.mon_manager.run_cluster_cmd(
        args=flush_cmd, wait=False, timeoutcmd=30)
    self.wait_until_true(rank_is_laggy, timeout=self.fs.beacon_timeout)

    # remove the injection, clean up the coredump and unfreeze the rank
    self.config_rm("mds", "mds_inject_journal_corrupt_dentry_first")
    self.delete_mds_coredump(rank_info['name'])
    self.fs.rank_freeze(False, rank=0)
    self.fs.wait_for_daemons()

    # the background flush must have failed
    try:
        flush_proc.wait()
    except CommandFailedError as err:
        print(err)
    else:
        self.fail("flush journal should fail!")

    # the tree must still be readable and the journal flushable
    self.mount_a.mount_wait()
    self.mount_a.run_shell_payload("stat a/ && find a/")
    self.fs.flush()

0 comments on commit ed581bf

Please sign in to comment.