Skip to content

Commit

Permalink
mds: add truncate size handling support for fscrypt
Browse files Browse the repository at this point in the history
When fscrypt is enabled and a file is truncated to a smaller size, the
old size and new size will always be rounded up to CEPH_FSCRYPT_BLOCK_SIZE,
which is 4K for now. For example, if truncating the file size from 3KB
to 2KB, the MDS will always get old_size == new_size == 4KB. But
only in this case will the truncate request pass the encrypted last
block, so we can check for it: if its length is non-zero, we can be
sure that the request is truncating to a smaller size.

Signed-off-by: Xiubo Li <xiubli@redhat.com>
  • Loading branch information
lxbsz committed Nov 17, 2021
1 parent 4f9c1ad commit 34eafd9
Show file tree
Hide file tree
Showing 10 changed files with 292 additions and 40 deletions.
10 changes: 10 additions & 0 deletions src/common/options/mds.yaml.in
Expand Up @@ -11,6 +11,16 @@ options:
- mds
flags:
- runtime
- name: mds_fscrypt_last_block_max_size
type: size
level: advanced
desc: maximum size of the last block without the header along with a truncate
request when the fscrypt is enabled.
default: 4_K
services:
- mds
flags:
- runtime
- name: mds_valgrind_exit
type: bool
level: dev
Expand Down
131 changes: 103 additions & 28 deletions src/mds/MDCache.cc
Expand Up @@ -63,6 +63,7 @@
#include "events/ESessions.h"

#include "InoTable.h"
#include "fscrypt.h"

#include "common/Timer.h"

Expand Down Expand Up @@ -102,27 +103,6 @@ class MDCacheContext : public virtual MDSContext {
explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
};


/**
 * Only for contexts called back from an I/O completion
 *
 * Note: duplication of members wrt MDCacheContext, because
 * it's the lesser of two evils compared with introducing
 * yet another piece of (multiple) inheritance.
 */
class MDCacheIOContext : public virtual MDSIOContextBase {
protected:
  MDCache *mdcache;

  // Returns the MDSRank reached through the cache; a null cache pointer
  // trips the assertion instead of being dereferenced.
  MDSRank *get_mds() override
  {
    ceph_assert(mdcache != nullptr);
    return mdcache->mds;
  }
public:
  // `track` is forwarded unchanged to the MDSIOContextBase constructor.
  explicit MDCacheIOContext(MDCache *mdc_, bool track=true) :
    MDSIOContextBase(track), mdcache(mdc_) {}
};

class MDCacheLogContext : public virtual MDSLogContextBase {
protected:
MDCache *mdcache;
Expand Down Expand Up @@ -6430,6 +6410,22 @@ void MDCache::truncate_inode(CInode *in, LogSegment *ls)
_truncate_inode(in, ls);
}

struct C_IO_MDC_TruncateWriteFinish : public MDCacheIOContext {
CInode *in;
LogSegment *ls;
uint32_t block_size;
C_IO_MDC_TruncateWriteFinish(MDCache *c, CInode *i, LogSegment *l, uint32_t bs) :
MDCacheIOContext(c, false), in(i), ls(l), block_size(bs) {
}
void finish(int r) override {
ceph_assert(r == 0 || r == -CEPHFS_ENOENT);
mdcache->truncate_inode_write_finish(in, ls, block_size);
}
void print(ostream& out) const override {
out << "file_truncate_write(" << in->ino() << ")";
}
};

struct C_IO_MDC_TruncateFinish : public MDCacheIOContext {
CInode *in;
LogSegment *ls;
Expand All @@ -6455,7 +6451,9 @@ void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
ceph_assert(pi->is_truncating());
ceph_assert(pi->truncate_size < (1ULL << 63));
ceph_assert(pi->truncate_from < (1ULL << 63));
ceph_assert(pi->truncate_size < pi->truncate_from);
ceph_assert(pi->truncate_size < pi->truncate_from ||
(pi->truncate_size == pi->truncate_from &&
pi->fscrypt_last_block.length()));


SnapRealm *realm = in->find_snaprealm();
Expand All @@ -6469,13 +6467,52 @@ void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
snapc = &nullsnap;
ceph_assert(in->last == CEPH_NOSNAP);
}
dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl;
dout(10) << "_truncate_inode snapc " << snapc << " on " << *in
<< " fscrypt_last_block length is " << pi->fscrypt_last_block.length()
<< dendl;
auto layout = pi->layout;
filer.truncate(in->ino(), &layout, *snapc,
pi->truncate_size, pi->truncate_from-pi->truncate_size,
pi->truncate_seq, ceph::real_time::min(), 0,
new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
mds->finisher));
struct ceph_fscrypt_last_block_header header;
memset(&header, 0, sizeof(header));
bufferlist data;
if (pi->fscrypt_last_block.length()) {
auto bl = pi->fscrypt_last_block.cbegin();
DECODE_START(1, bl);
decode(header.objver, bl);
decode(header.file_offset, bl);
decode(header.block_size, bl);
bl.copy(header.block_size, data);
DECODE_FINISH(bl);
}

/*
* If the block_size is 0, that means the fscrypt_last_block is empty
* or there is no data that needs to be written from fscrypt_last_block,
* which means the truncate size is located in a file hole.
*/
if (header.block_size) {
dout(10) << "_truncate_inode write on inode " << *in << " objver: "
<< header.objver << " offset: " << header.file_offset << " blen: "
<< header.block_size << dendl;
filer.write(in->ino(), &layout, *snapc, header.file_offset, header.block_size,
data, ceph::real_time::min(), 0,
new C_OnFinisher(new C_IO_MDC_TruncateWriteFinish(this, in, ls,
header.block_size),
mds->finisher));
#if 0
filer.write_trunc(in->ino(), &layout, *snapc, header.file_offset, header.block_size,
data, ceph::real_time::min(), 0, pi->truncate_size, pi->truncate_seq - 1,
new C_OnFinisher(new C_IO_MDC_TruncateFinish1(this, in, ls),
mds->finisher));
#endif
} else {
dout(10) << "_truncate_inode truncate on inode " << *in << dendl;
filer.truncate(in->ino(), &layout, *snapc,
pi->truncate_size, pi->truncate_from-pi->truncate_size,
pi->truncate_seq, ceph::real_time::min(), 0,
new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
mds->finisher));
}

}

struct C_MDC_TruncateLogged : public MDCacheLogContext {
Expand All @@ -6488,6 +6525,44 @@ struct C_MDC_TruncateLogged : public MDCacheLogContext {
}
};

/**
 * Continuation run after the fscrypt last-block write has completed
 * (invoked from C_IO_MDC_TruncateWriteFinish::finish()).
 *
 * Re-checks the truncate invariants on the inode, picks the snap context
 * and then issues the filer truncate, whose completion is
 * C_IO_MDC_TruncateFinish.
 *
 * @param in          inode being truncated
 * @param ls          log segment passed through to the truncate completion
 * @param block_size  size of the block that was just written; it is added
 *                    to the truncate length
 */
void MDCache::truncate_inode_write_finish(CInode *in, LogSegment *ls,
                                          uint32_t block_size)
{
  const auto& pi = in->get_inode();
  dout(10) << "_truncate_inode_write "
           << pi->truncate_from << " -> " << pi->truncate_size
           << " on " << *in << dendl;

  // A truncate must be in flight, both sizes must fit in 63 bits, and the
  // size must shrink -- or stay equal while an fscrypt last block is present.
  ceph_assert(pi->is_truncating());
  ceph_assert(pi->truncate_size < (1ULL << 63));
  ceph_assert(pi->truncate_from < (1ULL << 63));
  ceph_assert(pi->truncate_size < pi->truncate_from ||
              (pi->truncate_size == pi->truncate_from &&
               pi->fscrypt_last_block.length()));

  // Use the realm's snap context when the inode has one, otherwise an
  // empty context (only valid for a head inode).
  SnapRealm *realm = in->find_snaprealm();
  SnapContext nullsnap;
  const SnapContext *snapc = &nullsnap;
  if (realm) {
    dout(10) << " realm " << *realm << dendl;
    snapc = &realm->get_snap_context();
  } else {
    dout(10) << " NO realm, using null context" << dendl;
    ceph_assert(in->last == CEPH_NOSNAP);
  }

  // NOTE(review): this streams the snapc pointer itself, not *snapc --
  // confirm whether logging the address is intended.
  dout(10) << "_truncate_inode_write snapc " << snapc << " on " << *in
           << " fscrypt_last_block length is " << pi->fscrypt_last_block.length()
           << dendl;

  auto layout = pi->layout;
  // Truncate range: the size difference plus the block that was written.
  uint64_t length = pi->truncate_from - pi->truncate_size + block_size;
  auto *on_truncated =
    new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls), mds->finisher);
  filer.truncate(in->ino(), &layout, *snapc, pi->truncate_size, length,
                 pi->truncate_seq, ceph::real_time::min(), 0,
                 on_truncated);
}

void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
{
dout(10) << "truncate_inode_finish " << *in << dendl;
Expand Down
22 changes: 22 additions & 0 deletions src/mds/MDCache.h
Expand Up @@ -730,6 +730,8 @@ class MDCache {
void truncate_inode(CInode *in, LogSegment *ls);
void _truncate_inode(CInode *in, LogSegment *ls);
void truncate_inode_finish(CInode *in, LogSegment *ls);
void truncate_inode_write_finish(CInode *in, LogSegment *ls,
uint32_t block_size);
void truncate_inode_logged(CInode *in, MutationRef& mut);

void add_recovered_truncate(CInode *in, LogSegment *ls);
Expand Down Expand Up @@ -1362,4 +1364,24 @@ class CF_MDS_RetryRequestFactory : public MDSContextFactory {
bool drop_locks;
};

/**
 * Only for contexts called back from an I/O completion
 *
 * Note: duplication of members wrt MDCacheContext, because
 * it's the lesser of two evils compared with introducing
 * yet another piece of (multiple) inheritance.
 */
class MDCacheIOContext : public virtual MDSIOContextBase {
protected:
  MDCache *mdcache;

  // Returns the MDSRank reached through the cache; a null cache pointer
  // trips the assertion instead of being dereferenced.
  MDSRank *get_mds() override
  {
    ceph_assert(mdcache != nullptr);
    return mdcache->mds;
  }
public:
  // `track` is forwarded unchanged to the MDSIOContextBase constructor.
  explicit MDCacheIOContext(MDCache *mdc_, bool track=true) :
    MDSIOContextBase(track), mdcache(mdc_) {}
};

#endif
3 changes: 3 additions & 0 deletions src/mds/Mutation.h
Expand Up @@ -260,6 +260,9 @@ struct MutationImpl : public TrackedOp {
bool aborted = false;
bool killed = false;

bool fscrypt_verifing_objver = false;
uint64_t objver = 0;

// for applying projected inode changes
std::set<MDSCacheObject*> projected_nodes;
std::list<ScatterLock*> updated_locks;
Expand Down
102 changes: 100 additions & 2 deletions src/mds/Server.cc
Expand Up @@ -50,6 +50,7 @@
#include "common/perf_counters.h"
#include "include/compat.h"
#include "osd/OSDMap.h"
#include "fscrypt.h"

#include <errno.h>

Expand Down Expand Up @@ -1259,6 +1260,9 @@ void Server::handle_conf_change(const std::set<std::string>& changed) {
if (changed.count("mds_alternate_name_max")) {
alternate_name_max = g_conf().get_val<Option::size_t>("mds_alternate_name_max");
}
if (changed.count("mds_fscrypt_last_block_max_size")) {
fscrypt_last_block_max_size = g_conf().get_val<Option::size_t>("mds_fscrypt_last_block_max_size");
}
if (changed.count("mds_dir_max_entries")) {
dir_max_entries = g_conf().get_val<uint64_t>("mds_dir_max_entries");
dout(20) << __func__ << " max entries per directory changed to "
Expand Down Expand Up @@ -5010,6 +5014,29 @@ void Server::handle_client_file_readlock(MDRequestRef& mdr)
respond_to_request(mdr, 0);
}

struct C_IO_MDC_ReadtruncFinish : public MDCacheIOContext {
CInode *in;
MDRequestRef mdr;
bufferlist *data;

C_IO_MDC_ReadtruncFinish(MDCache *c, CInode *i, MDRequestRef& m, bufferlist *d) :
MDCacheIOContext(c, false), in(i), mdr(m), data(d) {
}
void finish(int r) override {
auto mds = get_mds();
if (r < 0 && r != -CEPHFS_ENOENT && r != -EOVERFLOW) {
dout(0) << "C_IO_MDC_ReadtruncFinish r = " << r << dendl;
}
ceph_assert(r >= 0 || r == -CEPHFS_ENOENT || r == -EOVERFLOW);
mdr->retry++;
delete data;
mdcache->dispatch_request(mdr);
}
void print(ostream& out) const override {
out << "read_trunc(" << in->ino() << ")";
}
};

void Server::handle_client_setattr(MDRequestRef& mdr)
{
const cref_t<MClientRequest> &req = mdr->client_request;
Expand All @@ -5029,6 +5056,9 @@ void Server::handle_client_setattr(MDRequestRef& mdr)
__u32 mask = req->head.args.setattr.mask;
__u32 access_mask = MAY_WRITE;

if (mdr->fscrypt_verifing_objver)
goto xlock_done;

if (req->get_header().version < 6) {
// No changes to fscrypted inodes by downrevved clients
if (!cur->get_inode()->fscrypt_auth.empty()) {
Expand All @@ -5054,6 +5084,8 @@ void Server::handle_client_setattr(MDRequestRef& mdr)
if (!mds->locker->acquire_locks(mdr, lov))
return;

xlock_done:

if ((mask & CEPH_SETATTR_UID) && (cur->get_inode()->uid != req->head.args.setattr.uid))
access_mask |= MAY_CHOWN;

Expand All @@ -5077,7 +5109,15 @@ void Server::handle_client_setattr(MDRequestRef& mdr)

bool truncating_smaller = false;
if (mask & CEPH_SETATTR_SIZE) {
truncating_smaller = req->head.args.setattr.size < old_size;
if (req->get_data().length() >
sizeof(struct ceph_fscrypt_last_block_header) + fscrypt_last_block_max_size) {
dout(10) << __func__ << ": the last block size is too large" << dendl;
respond_to_request(mdr, -CEPHFS_EINVAL);
return;
}

truncating_smaller = req->head.args.setattr.size < old_size ||
(req->head.args.setattr.size == old_size && req->get_data().length());
if (truncating_smaller && pip->is_truncating()) {
dout(10) << " waiting for pending truncate from " << pip->truncate_from
<< " to " << pip->truncate_size << " to complete on " << *cur << dendl;
Expand All @@ -5086,6 +5126,64 @@ void Server::handle_client_setattr(MDRequestRef& mdr)
cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
return;
}

if (truncating_smaller && req->get_data().length()) {
struct ceph_fscrypt_last_block_header header;
memset(&header, 0, sizeof(header));
auto bl = req->get_data().cbegin();
bufferlist *data = new bufferlist();
DECODE_START(1, bl);
decode(header.objver, bl);
decode(header.file_offset, bl);
decode(header.block_size, bl);
DECODE_FINISH(bl);

if (!header.block_size) {
dout(20) << __func__ << " The last block is located in a file hole!"
<< dendl;
} else if (mdr->objver) {
if (mdr->objver != header.objver) {
dout(0) << __func__ << ": header.objver:" << header.objver
<< " != current objver:" << mdr->objver
<< ", let client retry it!" << dendl;
respond_to_request(mdr, -CEPHFS_EAGAIN);
return;
}
mdr->fscrypt_verifing_objver = false;
} else {
mdr->fscrypt_verifing_objver = true;

dout(20) << __func__ << " mdr->objver:" << mdr->objver
<< " mdr->retry:" << mdr->retry
<< " header.objver: " << header.objver
<< " header.file_offset: " << header.file_offset
<< " header.block_size: " << header.block_size
<< dendl;

auto layout = pip->layout;
/*
* Try to get the object version after acquiring the xlock for the
* filelock. The locks are not dropped while doing this, to prevent
* any client from updating this object in RADOS.
*
* We can be sure that extents.size() will be 1, because the
* kclient forbids the fscrypt block size from being larger
* than an object's size. Also, one block won't cross two
* different objects.
*/
std::vector<ObjectExtent> extents;
Striper::file_to_extents(g_ceph_context, cur->ino(), &layout, header.file_offset,
header.block_size, pip->truncate_size, extents);
ceph_assert(extents.size() == 1);
mds->objecter->read_trunc(extents[0].oid, object_locator_t(layout.pool_id),
header.file_offset, (uint64_t)8, mdr->snapid, data,
0, pip->truncate_size, pip->truncate_seq,
new C_OnFinisher(new C_IO_MDC_ReadtruncFinish(mdcache, cur, mdr, data),
mds->finisher),
&mdr->objver);
return;
}
}
}

bool changed_ranges = false;
Expand Down Expand Up @@ -5120,7 +5218,7 @@ void Server::handle_client_setattr(MDRequestRef& mdr)
pi.inode->time_warp_seq++; // maybe not a timewarp, but still a serialization point.
if (mask & CEPH_SETATTR_SIZE) {
if (truncating_smaller) {
pi.inode->truncate(old_size, req->head.args.setattr.size);
pi.inode->truncate(old_size, req->head.args.setattr.size, req->get_data());
le->metablob.add_truncate_start(cur->ino());
} else {
pi.inode->size = req->head.args.setattr.size;
Expand Down

0 comments on commit 34eafd9

Please sign in to comment.