rgw: fix list op raced with put op maybe cause index delete (copy + fix) #41978

Closed · wants to merge 3 commits

src/cls/rgw/cls_rgw.cc (7 changes: 7 additions & 0 deletions)

@@ -2110,6 +2110,13 @@ int rgw_dir_suggest_changes(cls_method_context_t hctx,
      return -EINVAL;
    }

    if (cur_disk.pending_map.size() == 0) {

Review comment (Contributor) on this line:

note that there's already an if (cur_disk.pending_map.empty()) condition below that prevents dir suggestions from being applied against pending entries
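
For orientation, here is a self-contained sketch of the control flow under discussion: the early continue added by this hunk, the tag-timeout pruning loop that follows it, and the later pending_map.empty() check the comment above refers to. Everything below (DiskEntry, Verdict, handle_suggestion, the map of timestamps) is a simplified stand-in for illustration, not the actual cls_rgw types or API.

#include <chrono>
#include <map>
#include <string>

// Hypothetical, simplified model of rgw_dir_suggest_changes() around this
// hunk; all names and types are stand-ins, not the real cls_rgw structures.
struct DiskEntry {
  // tag -> time the op was prepared; stands in for cur_disk.pending_map
  std::map<std::string, std::chrono::system_clock::time_point> pending_map;
};

enum class Verdict { skip_suggestion, apply_suggestion };

Verdict handle_suggestion(DiskEntry& cur_disk, std::chrono::seconds tag_timeout) {
  // The check added by this PR: an entry with no pending ops at all has been
  // completed normally, so a stale suggestion from a racing list op is ignored.
  if (cur_disk.pending_map.empty()) {
    return Verdict::skip_suggestion;
  }

  // Existing behaviour (the loop visible below in the hunk): drop pending
  // entries whose tag has timed out ...
  const auto now = std::chrono::system_clock::now();
  for (auto it = cur_disk.pending_map.begin(); it != cur_disk.pending_map.end(); ) {
    if (now - it->second > tag_timeout) {
      it = cur_disk.pending_map.erase(it);
    } else {
      ++it;
    }
  }

  // ... and, as the comment above notes, a later pending_map.empty() check
  // means a suggestion is only applied once no unexpired pending ops remain.
  return cur_disk.pending_map.empty() ? Verdict::apply_suggestion
                                      : Verdict::skip_suggestion;
}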

Review comment (Contributor):

ok, i understand this PR better after the discussions around #45345

this early return means that we'll only apply suggestions for entries that had pending operations but they expired by tag timeout. the change in #45345 only addresses a race between dir_suggest and the complete, but this change also prevents clients from making suggestions against completed entries

in bucket listing, we have several conditions for calling check_disk_state() - !pending_map.empty() is not the only one. is it possible that we're relying on dir_suggest to work even though pending_map is empty?
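
To make the last point concrete, here is a small hypothetical model of the listing-side decision being described: re-verifying an entry (and hence possibly emitting a dir suggestion) can be triggered by more than a non-empty pending_map. ListedEntry, should_check_disk_state and force_check are illustrative assumptions, not the actual RGWRados code.

#include <map>
#include <string>

// Hypothetical stand-in for an entry returned by a bucket index listing.
struct ListedEntry {
  bool exists = false;                     // head object known to be complete
  std::map<std::string, int> pending_map;  // outstanding prepared ops (simplified)
};

// 'force_check' models any other listing-side reason to re-verify an entry
// (for example a caller-supplied filter); it is an assumption for illustration.
bool should_check_disk_state(const ListedEntry& entry, bool force_check) {
  return !entry.exists || !entry.pending_map.empty() || force_check;
}

If such a re-check can run for an entry whose pending_map is already empty, any suggestion it produces would now be skipped by the early continue in this hunk, which is the behaviour this comment questions.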

The diff hunk continues:

      /* If there are no pending entries left, there is nothing to do here.
       * This keeps a list op that raced with another op from pushing the
       * index into an inconsistent state. */
      continue;
    }

    real_time cur_time = real_clock::now();
    auto iter = cur_disk.pending_map.begin();
    while(iter != cur_disk.pending_map.end()) {

src/test/cls_rgw/test_cls_rgw.cc (170 changes: 166 additions & 4 deletions)

@@ -366,6 +366,168 @@ TEST_F(cls_rgw, index_suggest)
test_stats(ioctx, bucket_oid, RGWObjCategory::None, num_objs / 2, total_size);
}

TEST_F(cls_rgw, index_suggest_recover)
{
// Recovery case: a PUT op is prepared but never completes;
// index_suggest should recover the index entry.
string bucket_oid = str_int("bucket", 4);

ObjectWriteOperation op;
cls_rgw_bucket_init_index(op);
ASSERT_EQ(0, ioctx.operate(bucket_oid, &op));

//int epoch = 0;
uint64_t obj_size = 1024;

uint64_t num_entries = 0;
uint64_t total_size = 0;

string obj = str_int("obj", 1);
string tag = str_int("tag", 1);
string loc = str_int("loc", 1);
index_prepare(ioctx, bucket_oid, CLS_RGW_OP_ADD, tag, obj, loc);
test_stats(ioctx, bucket_oid, RGWObjCategory::None, num_entries, total_size);

rgw_bucket_dir_entry dirent;
dirent.key.name = obj;
dirent.locator = loc;
dirent.exists = true;
dirent.meta.size = 1024;
dirent.meta.accounted_size = 1024;

bufferlist updates;
cls_rgw_encode_suggestion(CEPH_RGW_UPDATE, dirent, updates);
num_entries += 1;
total_size += obj_size;

map<int, string> bucket_objs;
bucket_objs[0] = bucket_oid;
int r = CLSRGWIssueSetTagTimeout(ioctx, bucket_objs, 8 /* max aio */, 1)();
ASSERT_EQ(0, r);

sleep(1);

/* suggest changes! */
cls_rgw_suggest_changes(op, updates);
ASSERT_EQ(0, ioctx.operate(bucket_oid, &op));

test_stats(ioctx, bucket_oid, RGWObjCategory::None, num_entries, total_size);
}

TEST_F(cls_rgw, index_suggest_race_put)
{
// Race-with-PUT case: a list op races with a PUT op;
// index_suggest should not delete the completed index entry.
string bucket_oid = str_int("bucket", 5);

ObjectWriteOperation op;
cls_rgw_bucket_init_index(op);
ASSERT_EQ(0, ioctx.operate(bucket_oid, &op));

int epoch = 0;
uint64_t obj_size = 1024;

uint64_t num_entries = 0;
uint64_t total_size = 0;

string obj = str_int("obj", 1);
string tag = str_int("tag", 1);
string loc = str_int("loc", 1);

// PUT op
index_prepare(ioctx, bucket_oid, CLS_RGW_OP_ADD, tag, obj, loc);
test_stats(ioctx, bucket_oid, RGWObjCategory::None, num_entries, total_size);

rgw_bucket_dir_entry_meta meta;
meta.category = RGWObjCategory::None;
meta.size = obj_size;
total_size += meta.size;
num_entries += 1;

index_complete(ioctx, bucket_oid, CLS_RGW_OP_ADD, tag, ++epoch, obj, meta);
test_stats(ioctx, bucket_oid, RGWObjCategory::None, num_entries, total_size);

/* Simulated race:
 * 1. the list op reads the index after the PUT's index_prepare
 * 2. check_disk_state runs before the head object is written
 * 3. the dir_suggest_changes (sent via aio_operate) lands after index_complete
 */
// The stale suggestion targets the entry whose PUT has since completed.
rgw_bucket_dir_entry dirent;
dirent.key.name = obj;
dirent.locator = loc;

bufferlist updates;
cls_rgw_encode_suggestion(CEPH_RGW_REMOVE, dirent, updates);

/* suggest changes! */
cls_rgw_suggest_changes(op, updates);
ASSERT_EQ(0, ioctx.operate(bucket_oid, &op));

test_stats(ioctx, bucket_oid, RGWObjCategory::None, num_entries, total_size);
}

TEST_F(cls_rgw, index_suggest_race_delete)
{
// Race-with-DELETE case: a list op races with a DELETE op;
// index_suggest should not resurrect the deleted index entry.
string bucket_oid = str_int("bucket", 6);

ObjectWriteOperation op;
cls_rgw_bucket_init_index(op);
ASSERT_EQ(0, ioctx.operate(bucket_oid, &op));

int epoch = 0;
uint64_t obj_size = 1024;

uint64_t num_entries = 0;
uint64_t total_size = 0;

string obj = str_int("obj", 1);
string tag = str_int("tag", 1);
string loc = str_int("loc", 1);

// first PUT op
index_prepare(ioctx, bucket_oid, CLS_RGW_OP_ADD, tag, obj, loc);
test_stats(ioctx, bucket_oid, RGWObjCategory::None, num_entries, total_size);

rgw_bucket_dir_entry_meta meta;
meta.category = RGWObjCategory::None;
meta.size = obj_size;
total_size += meta.size;
num_entries += 1;

index_complete(ioctx, bucket_oid, CLS_RGW_OP_ADD, tag, ++epoch, obj, meta);
test_stats(ioctx, bucket_oid, RGWObjCategory::None, num_entries, total_size);

// second DEL op
index_prepare(ioctx, bucket_oid, CLS_RGW_OP_DEL, tag, obj, loc);
test_stats(ioctx, bucket_oid, RGWObjCategory::None, num_entries, total_size);

rgw_bucket_dir_entry_meta meta2;
num_entries -= 1;
total_size -= obj_size;

index_complete(ioctx, bucket_oid, CLS_RGW_OP_DEL, tag, ++epoch, obj, meta2);
test_stats(ioctx, bucket_oid, RGWObjCategory::None, num_entries, total_size);

/* Simulated race:
 * 1. the list op reads the index after the DELETE's index_prepare
 * 2. check_disk_state runs before the head object is deleted
 * 3. the dir_suggest_changes (sent via aio_operate) lands after index_complete
 */
rgw_bucket_dir_entry dirent;
dirent.key.name = obj;
dirent.locator = loc;
dirent.exists = true;
dirent.meta.size = 1024;
dirent.meta.accounted_size = 1024;

bufferlist updates;
cls_rgw_encode_suggestion(CEPH_RGW_UPDATE, dirent, updates);

/* suggest changes! */
cls_rgw_suggest_changes(op, updates);
ASSERT_EQ(0, ioctx.operate(bucket_oid, &op));

test_stats(ioctx, bucket_oid, RGWObjCategory::None, num_entries, total_size);
}

/*
* This case is used to test whether get_obj_vals will

@@ -374,7 +536,7 @@
*/
TEST_F(cls_rgw, index_list)
{
-  string bucket_oid = str_int("bucket", 4);
+  string bucket_oid = str_int("bucket", 7);

ObjectWriteOperation op;
cls_rgw_bucket_init_index(op);

@@ -450,7 +612,7 @@ TEST_F(cls_rgw, index_list)
*/
TEST_F(cls_rgw, index_list_delimited)
{
-  string bucket_oid = str_int("bucket", 7);
+  string bucket_oid = str_int("bucket", 8);

ObjectWriteOperation op;
cls_rgw_bucket_init_index(op);

@@ -553,7 +715,7 @@ TEST_F(cls_rgw, index_list_delimited)

TEST_F(cls_rgw, bi_list)
{
-  string bucket_oid = str_int("bucket", 5);
+  string bucket_oid = str_int("bucket", 9);

CephContext *cct = reinterpret_cast<CephContext *>(ioctx.cct());


@@ -1050,7 +1212,7 @@ static int bilog_trim(librados::IoCtx& ioctx, const std::string& oid,

TEST_F(cls_rgw, bi_log_trim)
{
-  string bucket_oid = str_int("bucket", 6);
+  string bucket_oid = str_int("bucket", 10);

ObjectWriteOperation op;
cls_rgw_bucket_init_index(op);