Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pacific: rgw multisite: metadata sync treats all errors as 'transient' for retry #42656

Merged
merged 3 commits on Aug 5, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions qa/suites/rgw/multisite/overrides.yaml
Expand Up @@ -14,5 +14,6 @@ overrides:
rgw md log max shards: 4
rgw data log num shards: 4
rgw sync obj etag verify: true
rgw sync meta inject err probability: 0.1
rgw:
compression type: random
33 changes: 21 additions & 12 deletions src/rgw/rgw_sync.cc
Expand Up @@ -1297,7 +1297,7 @@ int RGWMetaSyncSingleEntryCR::operate(const DoutPrefixProvider *dpp) {
break;
}

if ((sync_status == -EAGAIN || sync_status == -ECANCELED) && (tries < NUM_TRANSIENT_ERROR_RETRIES - 1)) {
if (tries < NUM_TRANSIENT_ERROR_RETRIES - 1) {
ldpp_dout(dpp, 20) << *this << ": failed to fetch remote metadata: " << section << ":" << key << ", will retry" << dendl;
continue;
}
Expand All @@ -1322,7 +1322,7 @@ int RGWMetaSyncSingleEntryCR::operate(const DoutPrefixProvider *dpp) {
tn->log(10, SSTR("removing local metadata entry"));
yield call(new RGWMetaRemoveEntryCR(sync_env, raw_key));
}
if ((retcode == -EAGAIN || retcode == -ECANCELED) && (tries < NUM_TRANSIENT_ERROR_RETRIES - 1)) {
if (tries < NUM_TRANSIENT_ERROR_RETRIES - 1) {
ldpp_dout(dpp, 20) << *this << ": failed to store metadata: " << section << ":" << key << ", got retcode=" << retcode << dendl;
continue;
}
Expand Down Expand Up @@ -1394,6 +1394,8 @@ class RGWCloneMetaLogCoroutine : public RGWCoroutine {
int state_store_mdlog_entries_complete();
};

#define META_SYNC_SPAWN_WINDOW 20

class RGWMetaSyncShardCR : public RGWCoroutine {
RGWMetaSyncEnv *sync_env;

Expand Down Expand Up @@ -1514,20 +1516,17 @@ class RGWMetaSyncShardCR : public RGWCoroutine {

if (child_ret < 0) {
ldpp_dout(sync_env->dpp, 0) << *this << ": child operation stack=" << child << " entry=" << pos << " returned " << child_ret << dendl;
// on any error code from RGWMetaSyncSingleEntryCR, we do not advance
// the sync status marker past this entry, and set
// can_adjust_marker=false to exit out of RGWMetaSyncShardCR.
// RGWMetaSyncShardControlCR will rerun RGWMetaSyncShardCR from the
// previous marker and retry
can_adjust_marker = false;
}

map<string, string>::iterator prev_iter = pos_to_prev.find(pos);
ceph_assert(prev_iter != pos_to_prev.end());

/*
* we should get -EAGAIN for transient errors, for which we want to retry, so we don't
* update the marker and abort. We'll get called again for these. Permanent errors will be
* handled by marking the entry at the error log shard, so that we retry on it separately
*/
if (child_ret == -EAGAIN) {
can_adjust_marker = false;
}

if (pos_to_prev.size() == 1) {
if (can_adjust_marker) {
sync_marker.marker = pos;
Expand Down Expand Up @@ -1626,6 +1625,11 @@ class RGWMetaSyncShardCR : public RGWCoroutine {
stack_to_pos[stack] = marker;
pos_to_prev[marker] = marker;
}
// limit spawn window
while (num_spawned() > META_SYNC_SPAWN_WINDOW) {
yield wait_for_child();
collect_children();
}
}
}
collect_children();
Expand Down Expand Up @@ -1814,6 +1818,11 @@ class RGWMetaSyncShardCR : public RGWCoroutine {
stack_to_pos[stack] = log_iter->id;
pos_to_prev[log_iter->id] = marker;
}
// limit spawn window
while (num_spawned() > META_SYNC_SPAWN_WINDOW) {
yield wait_for_child();
collect_children();
}
}
marker = log_iter->id;
}
Expand Down Expand Up @@ -2236,7 +2245,7 @@ int RGWRemoteMetaLog::run_sync(const DoutPrefixProvider *dpp, optional_yield y)
case rgw_meta_sync_info::StateBuildingFullSyncMaps:
tn->log(20, "building full sync maps");
r = run(dpp, new RGWFetchAllMetaCR(&sync_env, num_shards, sync_status.sync_markers, tn));
if (r == -EBUSY || r == -EAGAIN) {
if (r == -EBUSY || r == -EIO) {
backoff.backoff_sleep();
continue;
}
Expand Down