From 1ba0e4ac4c919cec430dd94a0a247704b059a75a Mon Sep 17 00:00:00 2001 From: Nick Vatamaniuc Date: Thu, 28 Oct 2021 16:53:55 -0400 Subject: [PATCH 1/2] Use configured shards db in custodian instead of `"dbs"` --- src/custodian/README | 2 +- src/custodian/src/custodian_server.erl | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/custodian/README b/src/custodian/README index 72681f447ec..ff88373c5ff 100644 --- a/src/custodian/README +++ b/src/custodian/README @@ -1,6 +1,6 @@ Custodian is responsible for the data stored in CouchDB databases. -Custodian scans the "dbs" database, which details the location of +Custodian scans the shards database, which details the location of every shard of every database and ensures that operators are aware of any shard that is under-replicated (has less than N copies). diff --git a/src/custodian/src/custodian_server.erl b/src/custodian/src/custodian_server.erl index 0a21eed2310..0c8b87e8780 100644 --- a/src/custodian/src/custodian_server.erl +++ b/src/custodian/src/custodian_server.erl @@ -132,8 +132,9 @@ start_shard_checker(#state{shard_checker=Pid}=State) when is_pid(Pid) -> start_event_listener() -> + DbName = mem3_sync:shards_db(), couch_event:link_listener( - ?MODULE, handle_db_event, nil, [{dbname, <<"dbs">>}] + ?MODULE, handle_db_event, nil, [{dbname, DbName}] ). handle_db_event(_DbName, updated, _St) -> From 8d86b39947b72cc46858a614d5f38ebf1b4341bf Mon Sep 17 00:00:00 2001 From: Nick Vatamaniuc Date: Thu, 28 Oct 2021 16:54:53 -0400 Subject: [PATCH 2/2] Eliminate custodian false positive errors for dbs with N < default N Previously, dbs with N < cluster default N would pollute logs with critical errors regarding not having enough shards. Instead, use each database's expected N value to emit custodian reports. Note: the expected N value is a bit tricky to understand since with the shard splitting feature, shard ranges are not guaranteed to exactly match for all copies.
The N value is then defined as the max number of rings which can be completed with the given set of shards -- complete the ring once, remove participating shards, try again, etc. Lucky for us, that function is already written (`mem3_util:calculate_max_n(Shards)`), so we are just re-using it. --- src/custodian/src/custodian_util.erl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/custodian/src/custodian_util.erl b/src/custodian/src/custodian_util.erl index ac46cb143ea..6d5a56093c5 100644 --- a/src/custodian/src/custodian_util.erl +++ b/src/custodian/src/custodian_util.erl @@ -21,7 +21,7 @@ % Old design doc which should be cleaned up -define(CUSTODIAN_ID, <<"_design/custodian">>). --record(state, {live, safe, n, callback, db, acc}). +-record(state, {live, safe, callback, db, acc}). %% public functions. @@ -55,10 +55,9 @@ ensure_dbs_exists() -> fold_dbs(Acc, Fun) -> Safe = maybe_redirect([node() | nodes()]), Live = Safe -- maintenance_nodes(Safe), - N = cluster_n(), {ok, Db} = ensure_dbs_exists(), try - State0 = #state{live=Live, safe=Safe, n=N, callback=Fun, db=Db, acc=Acc}, + State0 = #state{live=Live, safe=Safe, callback=Fun, db=Db, acc=Acc}, {ok, State1} = couch_db:fold_docs(Db, fun fold_dbs1/2, State0, []), State1#state.acc after @@ -82,9 +81,9 @@ fold_dbs1(#full_doc_info{id = Id} = FDI, State) -> fold_dbs(Id, Shards, State) -> IsSafe = fun(#shard{node = N}) -> lists:member(N, State#state.safe) end, IsLive = fun(#shard{node = N}) -> lists:member(N, State#state.live) end, - TargetN = State#state.n, LiveShards = lists:filter(IsLive, Shards), SafeShards = lists:filter(IsSafe, Shards), + TargetN = mem3_util:calculate_max_n(Shards), Acc0 = State#state.acc, Acc1 = case mem3_util:calculate_max_n(LiveShards) of LiveN when LiveN < TargetN ->