Skip to content

Commit

Permalink
Improve fabric_util get_db timeout logic
Browse files Browse the repository at this point in the history
Previously, users with low {Q, N} dbs often got the `"No DB shards could be
opened."` error when the cluster is overloaded. The hard-coded 100 msec timeout
was too low to open the few available shards and the whole request would crash
with a 500 error.

Attempt to calculate an optimal timeout value based on the number of shards and
the max fabric request timeout limit.

The sequence of doubling (by default) timeouts forms a geometric progression.
Use the well known closed form formula for the sum [0], and the maximum request
timeout, to calculate the initial timeout. The test case illustrates a few
examples with some default Q and N values.

Because we don't want the timeout value to be too low, since it takes time to
open shards, and we don't want to quickly cycle through a few initial shards
and discard the results, the minimum inital timeout is clipped to the
previously hard-coded 100 msec timeout. Unlike previously however, this minimum
value can now also be configured.

Another issue with the previous code was that it was emitting a generic error
without a specific reason why the shards could not be opened. Timeout was the
most likely reason, but to confirm user either had to enable debug logging, or
apply clever erlang tracing on the `couch_log:debug/2` call. So as an
improvement, emit the reason string into the get_shard/5 recursive call so it
can be bubbled up with the error tuple.

[0] https://en.wikipedia.org/wiki/Geometric_series

Fixes: #3733
  • Loading branch information
nickva committed Sep 8, 2021
1 parent 64281c0 commit 45443c0
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 12 deletions.
1 change: 1 addition & 0 deletions rel/overlay/etc/default.ini
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,7 @@ bind_address = 127.0.0.1
; all_docs_concurrency = 10
; changes_duration =
; shard_timeout_factor = 2
; shard_timeout_min_msec = 100
; uuid_prefix_len = 7
; request_timeout = 60000
; all_docs_timeout = 10000
Expand Down
90 changes: 78 additions & 12 deletions src/fabric/src/fabric_util.erl
Original file line number Diff line number Diff line change
Expand Up @@ -100,19 +100,22 @@ get_db(DbName) ->
get_db(DbName, []).

get_db(DbName, Options) ->
{Local, SameZone, DifferentZone} = mem3:group_by_proximity(mem3:shards(DbName)),
LiveShards = mem3:live_shards(DbName, [node() | nodes()]),
{Local, SameZone, DifferentZone} = mem3:group_by_proximity(LiveShards),
% Prefer shards on the same node over other nodes, prefer shards in the same zone over
% over zones and sort each remote list by name so that we don't repeatedly try the same node.
Shards = Local ++ lists:keysort(#shard.name, SameZone) ++ lists:keysort(#shard.name, DifferentZone),
% suppress shards from down nodes
Nodes = [node()|erlang:nodes()],
Live = [S || #shard{node = N} = S <- Shards, lists:member(N, Nodes)],
Factor = list_to_integer(config:get("fabric", "shard_timeout_factor", "2")),
get_shard(Live, Options, 100, Factor).

get_shard([], _Opts, _Timeout, _Factor) ->
erlang:error({internal_server_error, "No DB shards could be opened."});
get_shard([#shard{node = Node, name = Name} | Rest], Opts, Timeout, Factor) ->
% Only accept factors > 1, otherwise our math breaks further down
Factor = max(2, config:get_integer("fabric", "shard_timeout_factor", 2)),
MinTimeout = config:get_integer("fabric", "shard_timeout_min_msec", 100),
MaxTimeout = request_timeout(),
Timeout = get_db_timeout(length(Shards), Factor, MinTimeout, MaxTimeout),
get_shard(Shards, Options, Timeout, Factor, "No shards found").

get_shard([], _Opts, _Timeout, _Factor, Reason) ->
Msg = io_lib:format("No DB shards could be opened. Reason: ~s", [Reason]),
erlang:error({internal_server_error, lists:flatten(Msg)});
get_shard([#shard{node = Node, name = Name} | Rest], Opts, Timeout, Factor, _) ->
Mon = rexi_monitor:start([rexi_utils:server_pid(Node)]),
MFA = {fabric_rpc, open_shard, [Name, [{timeout, Timeout} | Opts]]},
Ref = rexi:cast(Node, self(), MFA, [sync]),
Expand All @@ -125,15 +128,40 @@ get_shard([#shard{node = Node, name = Name} | Rest], Opts, Timeout, Factor) ->
throw(Error);
{Ref, Reason} ->
couch_log:debug("Failed to open shard ~p because: ~p", [Name, Reason]),
get_shard(Rest, Opts, Timeout, Factor)
FmtReason = lists:flatten(io_lib:format("~p", [Reason])),
get_shard(Rest, Opts, Timeout, Factor, FmtReason)
after Timeout ->
couch_log:debug("Failed to open shard ~p after: ~p", [Name, Timeout]),
get_shard(Rest, Opts, Factor * Timeout, Factor)
NextTimeout = min(request_timeout(), Factor * Timeout),
FmtReason = lists:flatten(io_lib:format("timeout ~B msec", [Timeout])),
get_shard(Rest, Opts, NextTimeout, Factor, FmtReason)
end
after
rexi_monitor:stop(Mon)
end.


get_db_timeout(N, Factor, MinTimeout, MaxTimeout) ->
%
% The progression of timeouts forms a geometric series:
%
% MaxTimeout = T + T*F + T*F^2 + T*F^3 ...
%
% Where T is the initial timeout and F is the factor. The formula for
% the sum is:
%
% Sum[T * F^I, I <- 0..N] = T * (1 - F^(N + 1)) / (1 - F)
%
% Then, for a given sum and factor we can calculate the initial timeout T:
%
% T = Sum / ((1 - F^(N+1)) / (1 - F))
%
Timeout = MaxTimeout / ((1 - math:pow(Factor, N + 1)) / (1 - Factor)),

% Apply a minimum timeout value
max(MinTimeout, trunc(Timeout)).


error_info({{timeout, _} = Error, _Stack}) ->
Error;
error_info({{Error, Reason}, Stack}) ->
Expand Down Expand Up @@ -400,3 +428,41 @@ do_isolate(Fun) ->


-endif.


get_db_timeout_test() ->
% Q=1, N=1
?assertEqual(20000, get_db_timeout(1, 2, 100, 60000)),

% Q=2, N=1
?assertEqual(8571, get_db_timeout(2, 2, 100, 60000)),

% Q=2, N=3 (default)
?assertEqual(472, get_db_timeout(2 * 3, 2, 100, 60000)),

% Q=3, N=3
?assertEqual(100, get_db_timeout(3 * 3, 2, 100, 60000)),

% Q=4, N=1
?assertEqual(1935, get_db_timeout(4, 2, 100, 60000)),

% Q=8, N=1
?assertEqual(117, get_db_timeout(8, 2, 100, 60000)),

% Q=8, N=3 (default in 2.x)
?assertEqual(100, get_db_timeout(8 * 3, 2, 100, 60000)),

% Q=256, N=3
?assertEqual(100, get_db_timeout(256 * 3, 2, 100, 60000)),

% Large factor = 100
?assertEqual(100, get_db_timeout(2 * 3, 100, 100, 60000)),

% Small total request timeout = 1 sec
?assertEqual(100, get_db_timeout(2 * 3, 2, 100, 1000)),

% Large total request timeout
?assertEqual(28346, get_db_timeout(2 * 3, 2, 100, 3600000)),

% No shards at all
?assertEqual(60000, get_db_timeout(0, 2, 100, 60000)).

0 comments on commit 45443c0

Please sign in to comment.