From 234e863c908bed69b8f073404a5f130547a15e5a Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Mon, 27 Mar 2023 13:18:49 +0100 Subject: [PATCH 01/30] Configuration and module alignment Add claim function and target_n_val configuration into cuttlefish. Move modules around to try and make it more obvious where functions used in membership reside. --- eqc/new_cluster_membership_model_eqc.erl | 6 +- priv/riak_core.schema | 77 ++++ src/riak_core.app.src | 6 +- src/riak_core_claim_sim.erl | 23 +- src/riak_core_claimant.erl | 11 +- src/riak_core_gossip.erl | 317 +--------------- ...aim.erl => riak_core_membership_claim.erl} | 18 +- src/riak_core_membership_leave.erl | 343 ++++++++++++++++++ src/riak_core_new_claim.erl | 39 -- test/claim_simulation.erl | 17 +- test/rack_awareness_test.erl | 6 +- test/riak_core_claim_statem.erl | 6 +- 12 files changed, 468 insertions(+), 401 deletions(-) rename src/{riak_core_claim.erl => riak_core_membership_claim.erl} (99%) create mode 100644 src/riak_core_membership_leave.erl delete mode 100644 src/riak_core_new_claim.erl diff --git a/eqc/new_cluster_membership_model_eqc.erl b/eqc/new_cluster_membership_model_eqc.erl index 3589f0f36..f32f48b05 100644 --- a/eqc/new_cluster_membership_model_eqc.erl +++ b/eqc/new_cluster_membership_model_eqc.erl @@ -1608,14 +1608,14 @@ handle_down_nodes(CState, Next) -> claim_until_balanced(Ring, Node) -> %%{WMod, WFun} = app_helper:get_env(riak_core, wants_claim_fun), - {WMod, WFun} = {riak_core_claim, default_wants_claim}, + {WMod, WFun} = {riak_core_membership_claim, default_wants_claim}, NeedsIndexes = apply(WMod, WFun, [Ring, Node]), case NeedsIndexes of no -> Ring; {yes, _NumToClaim} -> %%{CMod, CFun} = app_helper:get_env(riak_core, choose_claim_fun), - {CMod, CFun} = {riak_core_claim, default_choose_claim}, + {CMod, CFun} = {riak_core_membership_claim, default_choose_claim}, NewRing = CMod:CFun(Ring, Node), claim_until_balanced(NewRing, Node) end. @@ -1682,7 +1682,7 @@ remove_from_cluster(Ring, ExitingNode) -> end, Ring, AllOwners), - riak_core_claim:claim_rebalance_n(TempRing, Other) + riak_core_membership_claim:claim_rebalance_n(TempRing, Other) end, ExitRing. diff --git a/priv/riak_core.schema b/priv/riak_core.schema index 7e578a4c6..f85b2c61c 100644 --- a/priv/riak_core.schema +++ b/priv/riak_core.schema @@ -227,6 +227,83 @@ hidden ]}. +%% @doc Choose claim function +%% Claim function to be used when handling joins to the cluster. +%% There are three supported functions: +%% - choose_claim_v2 (the default) designed for environments without location +%% awareness as a requirement +%% - choose_claim_v3 (deprecated) a claim function which treats claim as an +%% optimisation problem. It creates a number of possible claim plans and +%% evaluates them for violations, balance and diversity, choosing the 'best' +%% plan. claim_v3 is not location aware +%% - choose_claim_v4 a claim algorithm which refactors v2 to improve location +%% awareness +{mapping, "choose_claim_fun", "riak_core.choose_claim_fun", [ + {commented, "choose_claim_v2"}, + {datatype, {enum, [choose_claim_v2, choose_claim_v3, choose_claim_v4]}}, + merge +]}. + +%% @doc Target N Val for Cluster Administration +%% Cluster change operations such as joins and leaves will use a target_n_val +%% to control spacing of preflists across physical nodes. The default value +%% is 4, which is the default bucket propery for n_val + 1. 
This means that
+%% the target for a cluster change operation is to make sure that all preflists
+%% of n_val 3 are on 3 separate physical devices, even when a single failure
+%% has occurred.
+%% If the target_n_val is not met by a cluster change operation, the failure is
+%% not blocking - a warning will be printed in the cluster plan, but the plan
+%% will not be prevented from being committed.
+%% In some cases, by reducing the target_n_val it may be possible to reduce the
+%% number of transfers necessary to complete a cluster change operation.
+%% In clusters with a large number of nodes, larger target_n_val values can be
+%% supported, and may result in a better spread of load across the cluster
+%% when node failure occurs.
+{mapping, "target_n_val", "riak_core.target_n_val", [
+  {datatype, integer},
+  {default, 4},
+  {validators, ["target_nval_max", "target_nval_min"]},
+  {commented, 4}
+]}.
+
+%% target_n_val validators
+{validator, "target_nval_max",
+ "7 and larger are supported, but considered advanced config",
+ fun(Size) ->
+  Size =< 6
+ end}.
+
+{validator, "target_nval_min", "must be at least 1",
+ fun(Size) ->
+  Size >= 1
+ end}.
+
+%% @doc Target Location N Val for Cluster Administration
+%% Cluster change operations such as joins and leaves will use a
+%% target_location_n_val to control spacing of preflists across locations. This
+%% is to support clusters which have a concept of `location` failure as well as
+%% node failure (e.g. rack awareness is required, or support for AWS placement
+%% groups).
+%% In this case, nodes are assigned to locations, and as well as supporting
+%% the splitting of data replicas across nodes, attempts will also be made
+%% during cluster change operations to split preflists across locations.
+%% If the target_location_n_val is not met by a cluster change operation, the
+%% failure is not blocking - a warning will be printed in the cluster plan,
+%% but the plan will not be prevented from being committed.
+%% In some cases, by reducing the target_location_n_val it may be possible to
+%% reduce the number of transfers necessary to complete a cluster change
+%% operation.
+%% In clusters with a large number of nodes, larger target_location_n_val
+%% values can be supported.
+%% If the target_location_n_val is greater than the target_n_val, the
+%% target_n_val will be used.
+{mapping, "target_location_n_val", "riak_core.target_location_n_val", [
+  {datatype, integer},
+  {default, 3},
+  {validators, ["target_nval_max", "target_nval_min"]},
+  {commented, 3}
+]}.
+
 %% @doc On cluster leave - force full rebalance partitions
 %% By default on a cluster leave there will first be an attempt to handoff
 %% vnodes to safe (in terms of target_n_val) locations. In small clusters,
diff --git a/src/riak_core.app.src b/src/riak_core.app.src
index da371da20..89f0d8420 100644
--- a/src/riak_core.app.src
+++ b/src/riak_core.app.src
@@ -42,8 +42,10 @@
              {target_n_val, 4},
 
              %% Default claims functions
-             {wants_claim_fun, {riak_core_claim, default_wants_claim}},
-             {choose_claim_fun, {riak_core_claim, default_choose_claim}},
+             {wants_claim_fun,
+                 {riak_core_membership_claim, default_wants_claim}},
+             {choose_claim_fun,
+                 {riak_core_membership_claim, default_choose_claim}},
 
              %% Vnode inactivity timeout (how often to check if fallback vnodes
              %% should return their data) in ms.
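For reference, the cuttlefish mappings above surface in code as plain riak_core application environment values. A minimal sketch of the round trip (the riak.conf keys are the ones defined by the mappings above, read back via app_helper; the values shown are illustrative, not recommendations):

    %% riak.conf (illustrative):
    %%   choose_claim_fun = choose_claim_v4
    %%   target_n_val = 4
    %%   target_location_n_val = 3
    %%
    %% After cuttlefish translation, claim code reads these settings as:
    TargetN = app_helper:get_env(riak_core, target_n_val, 4),
    TargetLN = app_helper:get_env(riak_core, target_location_n_val, 3),
    ChooseFun = app_helper:get_env(riak_core, choose_claim_fun).
    %% here TargetN =:= 4, TargetLN =:= 3, ChooseFun =:= choose_claim_v4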
diff --git a/src/riak_core_claim_sim.erl b/src/riak_core_claim_sim.erl
index 3ef57e790..b6c8a9a7e 100644
--- a/src/riak_core_claim_sim.erl
+++ b/src/riak_core_claim_sim.erl
@@ -152,7 +152,7 @@ add_choose_params(Choose, TN) ->
     {CMod, CFun, Params1}.
 
 run_rebalance(Ring, Wants, Choose, Rebalance) ->
-    Ring2 = riak_core_claim:claim(Ring, Wants, Choose),
+    Ring2 = riak_core_membership_claim:claim(Ring, Wants, Choose),
     Rebalance(Ring, Ring2),
     Ring2.
 
@@ -517,9 +517,12 @@ commission_tests_rest() ->
     ].
 
 commission_claims() ->
-    [{{riak_core_claim, wants_claim_v1}, {riak_core_claim, choose_claim_v1}},
-     {{riak_core_claim, wants_claim_v2}, {riak_core_claim, choose_claim_v2}},
-     {{riak_core_claim, wants_claim_v3}, {riak_core_claim, choose_claim_v3}}].
+    [{{riak_core_membership_claim, wants_claim_v1},
+      {riak_core_membership_claim, choose_claim_v1}},
+     {{riak_core_membership_claim, wants_claim_v2},
+      {riak_core_membership_claim, choose_claim_v2}},
+     {{riak_core_membership_claim, wants_claim_v3},
+      {riak_core_membership_claim, choose_claim_v3}}].
 
 
 %% -------------------------------------------------------------------
@@ -532,15 +535,15 @@ run_test() ->
     Ring = riak_core_ring:fresh(64, anode),
     ?assertEqual(ok, run([{ring, Ring},
                           {target_n_val,2},
                           {wants,{riak_core_membership_claim,wants_claim_v2}},
                           {choose,{riak_core_membership_claim,choose_claim_v2}},
                           {cmds, [[{join,a}],[{join,b}]]},
                           {print,false},
                           {return_ring, false}])),
     Ring2 = run([{ring, Ring},
                  {target_n_val,2},
                  {wants,{riak_core_membership_claim,wants_claim_v2}},
                  {choose,{riak_core_membership_claim,choose_claim_v2}},
                  {cmds, [[{join,a}],[{join,b}]]},
                  {print,false},
                  {return_ring, true}]),
@@ -548,8 +551,8 @@ run_test() ->
     {ok, Fh} = file:open("sim.out", [write]),
     ?assertEqual(ok, run([{ring, Ring2},
                           {target_n_val,4},
                           {wants,{riak_core_membership_claim,wants_claim_v1}},
                           {choose,{riak_core_membership_claim,choose_claim_v1}},
                           {cmds, [[{join,3}]]},
                           {analysis, [{failures, 2},{n_val, 3}]},
                           {print,Fh},
diff --git a/src/riak_core_claimant.erl b/src/riak_core_claimant.erl
index a02ecc0aa..9b862e010 100644
--- a/src/riak_core_claimant.erl
+++ b/src/riak_core_claimant.erl
@@ -1456,7 +1456,7 @@ rebalance_ring(CNode, CState) ->
     rebalance_ring(CNode, Next, CState).
rebalance_ring(_CNode, [], CState) -> - CState2 = riak_core_claim:claim(CState), + CState2 = riak_core_membership_claim:claim(CState), Owners1 = riak_core_ring:all_owners(CState), Owners2 = riak_core_ring:all_owners(CState2), Owners3 = lists:zip(Owners1, Owners2), @@ -1505,13 +1505,16 @@ remove_node(CState, Node, Status, Replacing, Seed, Log) -> remove_node(CState, _Node, _Status, _Replacing, _Seed, _Log, []) -> CState; remove_node(CState, Node, Status, Replacing, Seed, Log, Indices) -> - CStateT1 = riak_core_ring:change_owners(CState, - riak_core_ring:all_next_owners(CState)), + CStateT1 = + riak_core_ring:change_owners( + CState, riak_core_ring:all_next_owners(CState)), case orddict:find(Node, Replacing) of {ok, NewNode} -> CStateT2 = reassign_indices_to(Node, NewNode, CStateT1); error -> - CStateT2 = riak_core_gossip:remove_from_cluster(CStateT1, Node, Seed) + CStateT2 = + riak_core_membership_leave:remove_from_cluster( + CStateT1, Node, Seed) end, Owners1 = riak_core_ring:all_owners(CState), diff --git a/src/riak_core_gossip.erl b/src/riak_core_gossip.erl index 4d40477ce..3c108549c 100644 --- a/src/riak_core_gossip.erl +++ b/src/riak_core_gossip.erl @@ -36,17 +36,12 @@ -export([start_link/0, stop/0]). -export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2, code_change/3]). --export ([distribute_ring/1, send_ring/1, send_ring/2, remove_from_cluster/2, - remove_from_cluster/3, random_gossip/1, +-export ([distribute_ring/1, send_ring/1, send_ring/2, random_gossip/1, recursive_gossip/1, random_recursive_gossip/1, rejoin/2, gossip_version/0]). -include("riak_core_ring.hrl"). --ifdef(TEST). --include_lib("eunit/include/eunit.hrl"). --endif. - %% Default gossip rate: allow at most 45 gossip messages every 10 seconds -define(DEFAULT_LIMIT, {45, 10000}). @@ -367,314 +362,4 @@ log_node_added(Node, New) -> log_node_removed(Node, Old) -> lager:info("'~s' removed from cluster (previously: '~s')~n", [Node, Old]). -remove_from_cluster(Ring, ExitingNode) -> - remove_from_cluster(Ring, ExitingNode, rand:seed(exrop, os:timestamp())). - -remove_from_cluster(Ring, ExitingNode, Seed) -> - % Transfer indexes to other nodes... - Owners = riak_core_ring:all_owners(Ring), - Members = riak_core_ring:claiming_members(Ring), - ExitRing = - case attempt_simple_transfer(Ring, ExitingNode, Seed, - Owners, Members) of - {ok, NR} -> - NR; - _ -> - %% re-diagonalize - %% first hand off all claims to *any* one else, - %% just so rebalance doesn't include exiting node - HN = hd(lists:delete(ExitingNode, Members)), - TempRing = - lists:foldl(fun({I,N}, R) when N == ExitingNode -> - riak_core_ring:transfer_node(I, HN, R); - (_, R) -> - R - end, - Ring, - Owners), - riak_core_claim:sequential_claim(TempRing, HN) - end, - ExitRing. - --ifdef(TEST). --type transfer_ring() :: [{integer(), term()}]. --else. --type transfer_ring() :: riak_core_ring:riak_core_ring(). --endif. - -%% @doc Simple transfer of leaving node's vnodes to safe place -%% Where safe place is any node that satisfies target_n_val for that vnode - -%% but with a preference to transfer to a node that has a lower number of -%% vnodes currently allocated. -%% If safe places cannot be found for all vnodes returns `target_n_fail` -%% Simple transfer is not location aware, but generally this wll be an initial -%% phase of a plan, and hence a temporary home - so location awareness is not -%% necessary. 
-%% `riak_core.full_rebalance_onleave = true` may be used to avoid this step, -%% although this may result in a large number of transfers --spec attempt_simple_transfer(transfer_ring(), - term(), - random:ran(), - [{integer(), term()}], - [term()]) -> - {ok, transfer_ring()}| - target_n_fail| - force_rebalance. -attempt_simple_transfer(Ring, ExitingNode, Seed, Owners, Members) -> - ForceRebalance = - app_helper:get_env(riak_core, full_rebalance_onleave, false), - case ForceRebalance of - true -> - force_rebalance; - false -> - TargetN = app_helper:get_env(riak_core, target_n_val), - Counts = - riak_core_claim:get_counts(Members, Owners), - RingFun = - fun(Partition, Node, R) -> - riak_core_ring:transfer_node(Partition, Node, R), - R - end, - simple_transfer(Owners, - {RingFun, TargetN, ExitingNode}, - Ring, - {Seed, [], Counts}) - end. - -%% @doc Simple transfer of leaving node's vnodes to safe place -%% Iterates over Owners, which must be sorted by Index (from 0...), and -%% attempts to safely re-allocate each ownerhsip which is currently set to -%% the exiting node --spec simple_transfer([{integer(), term()}], - {fun((integer(), - term(), - transfer_ring()) -> transfer_ring()), - pos_integer(), - term()}, - transfer_ring(), - {random:ran(), - [{integer(), term()}], - [{term(), non_neg_integer()}]}) -> - {ok, transfer_ring()}|target_n_fail. -simple_transfer([{P, ExitingNode}|Rest], - {RingFun, TargetN, ExitingNode}, - Ring, - {Seed, Prev, Counts}) -> - %% The ring is split into two parts: - %% Rest - this is forward looking from the current partition, in partition - %% order (ascending by partition number) - %% Prev - this is the part of the ring that has already been processed, - %% which is also in partition order (but descending by index number) - %% - %% With a ring size of 8, having looped to partition 3: - %% Rest = [{4, N4}, {5, N5}, {6, N6}, {7, N7}] - %% Prev = [{2, N2}, {1, N1}, {0, N0}] - %% - %% If we have a partition that is on the Exiting Node it is necessary to - %% look forward (TargetN - 1) allocations in Rest. It is also necessary - %% to look backward (TargetN - 1) allocations in Prev (from the rear of the - %% Prev list). - %% - %% This must be treated as a Ring though - as we reach an end of the list - %% the search must wrap around to the other end of the alternate list (i.e. - %% from 0 -> 7 and from 7 -> 0). - CheckRingFun = - fun(ForwardL, BackL) -> - Steps = TargetN - 1, - UnsafeNodeTuples = - case length(ForwardL) of - L when L < Steps -> - ForwardL ++ - lists:sublist(lists:reverse(BackL), Steps - L); - _ -> - lists:sublist(ForwardL, Steps) - end, - fun({Node, _Count}) -> - %% Nodes will remian as candidates if they are not in the list - %% of unsafe nodes - not lists:keymember(Node, 2, UnsafeNodeTuples) - end - end, - %% Filter candidate Nodes looking back in the ring at previous allocations. - %% The starting list of candidates is the list the claiming members in - %% Counts. 
- CandidatesB = lists:filter(CheckRingFun(Prev, Rest), Counts), - %% Filter candidate Nodes looking forward in the ring at existing - %% allocations - CandidatesF = lists:filter(CheckRingFun(Rest, Prev), CandidatesB), - - %% Qualifying candidates will be tuples of {Node, Count} where the Count - %% is that node's current count of allocated vnodes - case CandidatesF of - [] -> - target_n_fail; - Qualifiers -> - %% Look at the current allocated vnode counts for each qualifying - %% node, and find all qualifying nodes with the lowest of these - %% counts - [{Q0, BestCnt}|Others] = lists:keysort(2, Qualifiers), - PreferredCandidates = - [{Q0, BestCnt}| - lists:takewhile(fun({_, C}) -> C == BestCnt end, Others)], - - %% Final selection of a node as a destination for this partition, - %% The node Counts must be updated to reflect this allocation, and - %% the RingFun applied to actually queue the transfer - {Rand, Seed2} = rand:uniform_s(length(PreferredCandidates), Seed), - {Chosen, BestCnt} = lists:nth(Rand, PreferredCandidates), - UpdRing = RingFun(P, Chosen, Ring), - UpdCounts = - lists:keyreplace(Chosen, 1, Counts, {Chosen, BestCnt + 1}), - simple_transfer(Rest, - {RingFun, TargetN, ExitingNode}, - UpdRing, - {Seed2, [{P, Chosen}|Prev], UpdCounts}) - end; -simple_transfer([{P, N}|Rest], Statics, Ring, {Seed, Prev, Counts}) -> - %% This is already allocated to a node other than the exiting node, so - %% simply transition to the Previous ring accumulator - simple_transfer(Rest, Statics, Ring, {Seed, [{P, N}|Prev], Counts}); -simple_transfer([], _Statics, Ring, _LoopAccs) -> - {ok, Ring}. - - -%% =================================================================== -%% Unit tests -%% =================================================================== - --ifdef(TEST). - -test_ring_fun(P, N, R) -> - lists:keyreplace(P, 1, R, {P, N}). - -count_nodes(TestRing) -> - CountFun = - fun({_P, N}, Acc) -> - case lists:keyfind(N, 1, Acc) of - false -> - lists:ukeysort(1, [{N, 1}|Acc]); - {N, C} -> - lists:ukeysort(1, [{N, C + 1}|Acc]) - end - end, - lists:foldl(CountFun, [], TestRing). 
- -simple_transfer_simple_test() -> - R0 = [{0, n5}, {1, n1}, {2, n2}, {3, n3}, - {4, n4}, {5, n5}, {6, n3}, {7, n2}], - SomeTime = {1632,989499,279637}, - FixedSeed = rand:seed(exrop, SomeTime), - {ok, R1} = - simple_transfer(R0, - {fun test_ring_fun/3, 3, n4}, - R0, - {FixedSeed, - [], - lists:keydelete(n4, 1, count_nodes(R0))}), - ?assertMatch({4, n1}, lists:keyfind(4, 1, R1)), - - {ok, R2} = - simple_transfer(R0, - {fun test_ring_fun/3, 3, n5}, - R0, - {FixedSeed, - [], - lists:keydelete(n5, 1, count_nodes(R0))}), - ?assertMatch({0, n4}, lists:keyfind(0, 1, R2)), - ?assertMatch({5, n1}, lists:keyfind(5, 1, R2)), - - {ok, R3} = - simple_transfer(R0, - {fun test_ring_fun/3, 3, n1}, - R0, - {FixedSeed, - [], - lists:keydelete(n1, 1, count_nodes(R0))}), - ?assertMatch({1, n4}, lists:keyfind(1, 1, R3)), - - target_n_fail = - simple_transfer(R0, - {fun test_ring_fun/3, 3, n3}, - R0, - {FixedSeed, - [], - lists:keydelete(n3, 1, count_nodes(R0))}), - - target_n_fail = - simple_transfer(R0, - {fun test_ring_fun/3, 3, n2}, - R0, - {FixedSeed, - [], - lists:keydelete(n2, 1, count_nodes(R0))}), - - %% Target n failures due to wrap-around tail violations - R4 = [{0, n5}, {1, n1}, {2, n2}, {3, n3}, - {4, n4}, {5, n2}, {6, n3}, {7, n4}], - - target_n_fail = - simple_transfer(R4, - {fun test_ring_fun/3, 3, n5}, - R4, - {FixedSeed, - [], - lists:keydelete(n5, 1, count_nodes(R4))}), - - target_n_fail = - simple_transfer(R4, - {fun test_ring_fun/3, 3, n4}, - R4, - {FixedSeed, - [], - lists:keydelete(n4, 1, count_nodes(R4))}). - -simple_transfer_needstobesorted_test() -> - lists:foreach(fun transfer_needstobesorted_tester/1, lists:seq(1, 100)). - -transfer_needstobesorted_tester(I) -> - R0 = [{6,n3}, {13,n3}, {12,n6}, {11,n5}, {10,n4}, {9,n3}, {8,n2}, - {7,n1}, {5,n6}, {4,n5}, {3,n4}, {2,n3}, {1,n2}, {0,n1}], - VariableSeed = rand:seed(exrop, {1632, 989499, I * 13}), - {ok, R1} = - simple_transfer(lists:keysort(1, R0), - {fun test_ring_fun/3, 3, n3}, - R0, - {VariableSeed, - [], - lists:keydelete(n3, 1, count_nodes(R0))}), - ?assertMatch({13, n4}, lists:keyfind(13, 1, R1)). - -simple_transfer_evendistribution_test() -> - R0 = [{0, n1}, {1, n2}, {2, n3}, {3, n4}, {4, n5}, - {5, n6}, {6, n7}, {7, n8}, {8, n9}, {9, n10}, - {10, n1}, {11, n2}, {12, n3}, {13, n4}, {14, n5}, - {15, n6}, {16, n7}, {17, n8}, {18, n9}, {19, n10}, - {20, n1}, {21, n2}, {22, n3}, {23, n4}, {24, n5}, - {25, n6}, {26, n7}, {27, n8}, {28, n9}, {29, n10}, - {30, n1}, {31, n2}, {32, n3}, {33, n4}, {34, n5}, - {35, n6}, {36, n7}, {37, n8}, {38, n9}, {39, n10}, - {40, n1}, {41, n2}, {42, n3}, {43, n4}, {44, n5}, - {45, n6}, {46, n7}, {47, n8}, {48, n9}, {49, n10}, - {50, n1}, {51, n2}, {52, n3}, {53, n4}, {54, n5}, - {55, n6}, {56, n1}, {57, n2}, {58, n3}, {59, n10}, - {60, n5}, {61, n6}, {62, n7}, {63, n8}], - - SomeTime = {1632,989499,279637}, - FixedSeed = rand:seed(exrop, SomeTime), - {ok, R1} = - simple_transfer(R0, - {fun test_ring_fun/3, 3, n1}, - R0, - {FixedSeed, - [], - lists:keydelete(n1, 1, count_nodes(R0))}), - - NodeCounts = lists:keysort(2, count_nodes(R1)), - io:format("NodeCounts ~w~n", [NodeCounts]), - [{_LN, LC}|Rest] = NodeCounts, - [{_HN, HC}|_] = lists:reverse(Rest), - true = HC - LC == 2. - - --endif. 
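The simple_transfer logic removed above reappears unchanged in riak_core_membership_leave below. Its safety check boils down to collecting the owners within TargetN - 1 positions on either side of the vacated partition, wrapping around the ring. A self-contained sketch of that window calculation, assuming the ring is given as a list of {Partition, Node} pairs sorted by partition (the module and function names here are illustrative only, not part of the patch):

    -module(window_sketch).
    -export([unsafe_neighbours/3]).

    %% Nth is the 0-based position of the vacated partition in Owners.
    %% Returns the distinct nodes within TargetN - 1 steps forward and
    %% backward, wrapping around the ring; a replacement owner must avoid
    %% all of them to preserve TargetN.
    unsafe_neighbours(Owners, Nth, TargetN) ->
        Size = length(Owners),
        Steps = lists:seq(1, TargetN - 1),
        Idxs =
            [(Nth + S) rem Size || S <- Steps] ++
            [(Nth - S + Size) rem Size || S <- Steps],
        lists:usort([N || I <- Idxs, {_P, N} <- [lists:nth(I + 1, Owners)]]).

With a ring size of 8 and TargetN = 3, the window around partition 3 covers partitions 4 and 5 ahead and 2 and 1 behind, matching the Rest/Prev walkthrough in the comments above.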
diff --git a/src/riak_core_claim.erl b/src/riak_core_membership_claim.erl similarity index 99% rename from src/riak_core_claim.erl rename to src/riak_core_membership_claim.erl index d337a2f31..1b8095038 100644 --- a/src/riak_core_claim.erl +++ b/src/riak_core_membership_claim.erl @@ -34,13 +34,13 @@ %% number of partitions owned by the same node. %% The exact amount that is considered tolerable is determined by the -%% application env variable "target_n_val". The functions in riak_core_claim -%% will ensure that all sequences up to target_n_val long contain no repeats if -%% at all possible. The effect of this is that when the number of nodes in the -%% system is smaller than target_n_val, a potentially large number of partitions -%% must be moved in order to safely add a new node. After the cluster has grown -%% beyond that size, a minimal number of partitions (1/NumNodes) will generally -%% be moved. +%% application env variable "target_n_val". The functions in +%% riak_core_membership claim will ensure that all sequences up to target_n_val +%% long contain no repeats if at all possible. The effect of this is that when +%% the number of nodes in the system is smaller than target_n_val, a potentially +%% large number of partitions must be moved in order to safely add a new node. +%% After the cluster has grown beyond that size, a minimal number of partitions +%% (1/NumNodes) will generally be moved. %% If the number of nodes does not divide evenly into the number of partitions, %% it may not be possible to perfectly achieve the maximum spread constraint. @@ -51,7 +51,7 @@ %% to set it to the largest value you expect to use for any bucket's n_val. The %% default is 4. --module(riak_core_claim). +-module(riak_core_membership_claim). -export([claim/1, claim/3, claim_until_balanced/2, claim_until_balanced/4]). -export([default_wants_claim/1, default_wants_claim/2, default_choose_claim/1, default_choose_claim/2, default_choose_claim/3, @@ -334,7 +334,7 @@ choose_claim_v2(RingOrig, Node, Params0) -> Padding = lists:duplicate(TargetN, undefined), Expanded = lists:sublist(Active ++ Padding, TargetN), ExpandedLocation = get_nodes_by_location(Expanded, Ring), - PreferredClaim = riak_core_claim:diagonal_stripe(Ring, ExpandedLocation), + PreferredClaim = diagonal_stripe(Ring, ExpandedLocation), PreferredNth = [begin {Nth, Idx} = lists:keyfind(Idx, 2, AllIndices), Nth diff --git a/src/riak_core_membership_leave.erl b/src/riak_core_membership_leave.erl new file mode 100644 index 000000000..badccc7ce --- /dev/null +++ b/src/riak_core_membership_leave.erl @@ -0,0 +1,343 @@ +%% ------------------------------------------------------------------- +%% +%% riak_core: Core Riak Application +%% +%% Copyright (c) 2007-2010 Basho Technologies, Inc. All Rights Reserved. +%% +%% This file is provided to you under the Apache License, +%% Version 2.0 (the "License"); you may not use this file +%% except in compliance with the License. You may obtain +%% a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, +%% software distributed under the License is distributed on an +%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +%% KIND, either express or implied. See the License for the +%% specific language governing permissions and limitations +%% under the License. +%% +%% ------------------------------------------------------------------- + +-module(riak_core_membership_leave). + +-include("riak_core_ring.hrl"). 
+ +-ifdef(TEST). +-include_lib("eunit/include/eunit.hrl"). +-endif. + +-export([remove_from_cluster/2, remove_from_cluster/3]). + +remove_from_cluster(Ring, ExitingNode) -> + remove_from_cluster(Ring, ExitingNode, rand:seed(exrop, os:timestamp())). + +remove_from_cluster(Ring, ExitingNode, Seed) -> + % Transfer indexes to other nodes... + Owners = riak_core_ring:all_owners(Ring), + Members = riak_core_ring:claiming_members(Ring), + ExitRing = + case attempt_simple_transfer(Ring, ExitingNode, Seed, + Owners, Members) of + {ok, NR} -> + NR; + _ -> + %% re-diagonalize + %% first hand off all claims to *any* one else, + %% just so rebalance doesn't include exiting node + HN = hd(lists:delete(ExitingNode, Members)), + TempRing = + lists:foldl(fun({I,N}, R) when N == ExitingNode -> + riak_core_ring:transfer_node(I, HN, R); + (_, R) -> + R + end, + Ring, + Owners), + riak_core_membership_claim:sequential_claim(TempRing, HN) + end, + ExitRing. + +-ifdef(TEST). +-type transfer_ring() :: [{integer(), term()}]. +-else. +-type transfer_ring() :: riak_core_ring:riak_core_ring(). +-endif. + +%% @doc Simple transfer of leaving node's vnodes to safe place +%% Where safe place is any node that satisfies target_n_val for that vnode - +%% but with a preference to transfer to a node that has a lower number of +%% vnodes currently allocated. +%% If safe places cannot be found for all vnodes returns `target_n_fail` +%% Simple transfer is not location aware, but generally this wll be an initial +%% phase of a plan, and hence a temporary home - so location awareness is not +%% necessary. +%% `riak_core.full_rebalance_onleave = true` may be used to avoid this step, +%% although this may result in a large number of transfers +-spec attempt_simple_transfer(transfer_ring(), + term(), + random:ran(), + [{integer(), term()}], + [term()]) -> + {ok, transfer_ring()}| + target_n_fail| + force_rebalance. +attempt_simple_transfer(Ring, ExitingNode, Seed, Owners, Members) -> + ForceRebalance = + app_helper:get_env(riak_core, full_rebalance_onleave, false), + case ForceRebalance of + true -> + force_rebalance; + false -> + TargetN = app_helper:get_env(riak_core, target_n_val), + Counts = + riak_core_membership_claim:get_counts(Members, Owners), + RingFun = + fun(Partition, Node, R) -> + riak_core_ring:transfer_node(Partition, Node, R), + R + end, + simple_transfer(Owners, + {RingFun, TargetN, ExitingNode}, + Ring, + {Seed, [], Counts}) + end. + +%% @doc Simple transfer of leaving node's vnodes to safe place +%% Iterates over Owners, which must be sorted by Index (from 0...), and +%% attempts to safely re-allocate each ownerhsip which is currently set to +%% the exiting node +-spec simple_transfer([{integer(), term()}], + {fun((integer(), + term(), + transfer_ring()) -> transfer_ring()), + pos_integer(), + term()}, + transfer_ring(), + {random:ran(), + [{integer(), term()}], + [{term(), non_neg_integer()}]}) -> + {ok, transfer_ring()}|target_n_fail. 
+simple_transfer([{P, ExitingNode}|Rest], + {RingFun, TargetN, ExitingNode}, + Ring, + {Seed, Prev, Counts}) -> + %% The ring is split into two parts: + %% Rest - this is forward looking from the current partition, in partition + %% order (ascending by partition number) + %% Prev - this is the part of the ring that has already been processed, + %% which is also in partition order (but descending by index number) + %% + %% With a ring size of 8, having looped to partition 3: + %% Rest = [{4, N4}, {5, N5}, {6, N6}, {7, N7}] + %% Prev = [{2, N2}, {1, N1}, {0, N0}] + %% + %% If we have a partition that is on the Exiting Node it is necessary to + %% look forward (TargetN - 1) allocations in Rest. It is also necessary + %% to look backward (TargetN - 1) allocations in Prev (from the rear of the + %% Prev list). + %% + %% This must be treated as a Ring though - as we reach an end of the list + %% the search must wrap around to the other end of the alternate list (i.e. + %% from 0 -> 7 and from 7 -> 0). + CheckRingFun = + fun(ForwardL, BackL) -> + Steps = TargetN - 1, + UnsafeNodeTuples = + case length(ForwardL) of + L when L < Steps -> + ForwardL ++ + lists:sublist(lists:reverse(BackL), Steps - L); + _ -> + lists:sublist(ForwardL, Steps) + end, + fun({Node, _Count}) -> + %% Nodes will remian as candidates if they are not in the list + %% of unsafe nodes + not lists:keymember(Node, 2, UnsafeNodeTuples) + end + end, + %% Filter candidate Nodes looking back in the ring at previous allocations. + %% The starting list of candidates is the list the claiming members in + %% Counts. + CandidatesB = lists:filter(CheckRingFun(Prev, Rest), Counts), + %% Filter candidate Nodes looking forward in the ring at existing + %% allocations + CandidatesF = lists:filter(CheckRingFun(Rest, Prev), CandidatesB), + + %% Qualifying candidates will be tuples of {Node, Count} where the Count + %% is that node's current count of allocated vnodes + case CandidatesF of + [] -> + target_n_fail; + Qualifiers -> + %% Look at the current allocated vnode counts for each qualifying + %% node, and find all qualifying nodes with the lowest of these + %% counts + [{Q0, BestCnt}|Others] = lists:keysort(2, Qualifiers), + PreferredCandidates = + [{Q0, BestCnt}| + lists:takewhile(fun({_, C}) -> C == BestCnt end, Others)], + + %% Final selection of a node as a destination for this partition, + %% The node Counts must be updated to reflect this allocation, and + %% the RingFun applied to actually queue the transfer + {Rand, Seed2} = rand:uniform_s(length(PreferredCandidates), Seed), + {Chosen, BestCnt} = lists:nth(Rand, PreferredCandidates), + UpdRing = RingFun(P, Chosen, Ring), + UpdCounts = + lists:keyreplace(Chosen, 1, Counts, {Chosen, BestCnt + 1}), + simple_transfer(Rest, + {RingFun, TargetN, ExitingNode}, + UpdRing, + {Seed2, [{P, Chosen}|Prev], UpdCounts}) + end; +simple_transfer([{P, N}|Rest], Statics, Ring, {Seed, Prev, Counts}) -> + %% This is already allocated to a node other than the exiting node, so + %% simply transition to the Previous ring accumulator + simple_transfer(Rest, Statics, Ring, {Seed, [{P, N}|Prev], Counts}); +simple_transfer([], _Statics, Ring, _LoopAccs) -> + {ok, Ring}. + + +%% =================================================================== +%% Unit tests +%% =================================================================== + +-ifdef(TEST). + +test_ring_fun(P, N, R) -> + lists:keyreplace(P, 1, R, {P, N}). 
+ +count_nodes(TestRing) -> + CountFun = + fun({_P, N}, Acc) -> + case lists:keyfind(N, 1, Acc) of + false -> + lists:ukeysort(1, [{N, 1}|Acc]); + {N, C} -> + lists:ukeysort(1, [{N, C + 1}|Acc]) + end + end, + lists:foldl(CountFun, [], TestRing). + +simple_transfer_simple_test() -> + R0 = [{0, n5}, {1, n1}, {2, n2}, {3, n3}, + {4, n4}, {5, n5}, {6, n3}, {7, n2}], + SomeTime = {1632,989499,279637}, + FixedSeed = rand:seed(exrop, SomeTime), + {ok, R1} = + simple_transfer(R0, + {fun test_ring_fun/3, 3, n4}, + R0, + {FixedSeed, + [], + lists:keydelete(n4, 1, count_nodes(R0))}), + ?assertMatch({4, n1}, lists:keyfind(4, 1, R1)), + + {ok, R2} = + simple_transfer(R0, + {fun test_ring_fun/3, 3, n5}, + R0, + {FixedSeed, + [], + lists:keydelete(n5, 1, count_nodes(R0))}), + ?assertMatch({0, n4}, lists:keyfind(0, 1, R2)), + ?assertMatch({5, n1}, lists:keyfind(5, 1, R2)), + + {ok, R3} = + simple_transfer(R0, + {fun test_ring_fun/3, 3, n1}, + R0, + {FixedSeed, + [], + lists:keydelete(n1, 1, count_nodes(R0))}), + ?assertMatch({1, n4}, lists:keyfind(1, 1, R3)), + + target_n_fail = + simple_transfer(R0, + {fun test_ring_fun/3, 3, n3}, + R0, + {FixedSeed, + [], + lists:keydelete(n3, 1, count_nodes(R0))}), + + target_n_fail = + simple_transfer(R0, + {fun test_ring_fun/3, 3, n2}, + R0, + {FixedSeed, + [], + lists:keydelete(n2, 1, count_nodes(R0))}), + + %% Target n failures due to wrap-around tail violations + R4 = [{0, n5}, {1, n1}, {2, n2}, {3, n3}, + {4, n4}, {5, n2}, {6, n3}, {7, n4}], + + target_n_fail = + simple_transfer(R4, + {fun test_ring_fun/3, 3, n5}, + R4, + {FixedSeed, + [], + lists:keydelete(n5, 1, count_nodes(R4))}), + + target_n_fail = + simple_transfer(R4, + {fun test_ring_fun/3, 3, n4}, + R4, + {FixedSeed, + [], + lists:keydelete(n4, 1, count_nodes(R4))}). + +simple_transfer_needstobesorted_test() -> + lists:foreach(fun transfer_needstobesorted_tester/1, lists:seq(1, 100)). + +transfer_needstobesorted_tester(I) -> + R0 = [{6,n3}, {13,n3}, {12,n6}, {11,n5}, {10,n4}, {9,n3}, {8,n2}, + {7,n1}, {5,n6}, {4,n5}, {3,n4}, {2,n3}, {1,n2}, {0,n1}], + VariableSeed = rand:seed(exrop, {1632, 989499, I * 13}), + {ok, R1} = + simple_transfer(lists:keysort(1, R0), + {fun test_ring_fun/3, 3, n3}, + R0, + {VariableSeed, + [], + lists:keydelete(n3, 1, count_nodes(R0))}), + ?assertMatch({13, n4}, lists:keyfind(13, 1, R1)). + +simple_transfer_evendistribution_test() -> + R0 = [{0, n1}, {1, n2}, {2, n3}, {3, n4}, {4, n5}, + {5, n6}, {6, n7}, {7, n8}, {8, n9}, {9, n10}, + {10, n1}, {11, n2}, {12, n3}, {13, n4}, {14, n5}, + {15, n6}, {16, n7}, {17, n8}, {18, n9}, {19, n10}, + {20, n1}, {21, n2}, {22, n3}, {23, n4}, {24, n5}, + {25, n6}, {26, n7}, {27, n8}, {28, n9}, {29, n10}, + {30, n1}, {31, n2}, {32, n3}, {33, n4}, {34, n5}, + {35, n6}, {36, n7}, {37, n8}, {38, n9}, {39, n10}, + {40, n1}, {41, n2}, {42, n3}, {43, n4}, {44, n5}, + {45, n6}, {46, n7}, {47, n8}, {48, n9}, {49, n10}, + {50, n1}, {51, n2}, {52, n3}, {53, n4}, {54, n5}, + {55, n6}, {56, n1}, {57, n2}, {58, n3}, {59, n10}, + {60, n5}, {61, n6}, {62, n7}, {63, n8}], + + SomeTime = {1632,989499,279637}, + FixedSeed = rand:seed(exrop, SomeTime), + {ok, R1} = + simple_transfer(R0, + {fun test_ring_fun/3, 3, n1}, + R0, + {FixedSeed, + [], + lists:keydelete(n1, 1, count_nodes(R0))}), + + NodeCounts = lists:keysort(2, count_nodes(R1)), + io:format("NodeCounts ~w~n", [NodeCounts]), + [{_LN, LC}|Rest] = NodeCounts, + [{_HN, HC}|_] = lists:reverse(Rest), + true = HC - LC == 2. + + +-endif. 
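As the eunit tests above show, leave planning is made deterministic by supplying an explicit seed, which remove_from_cluster/3 threads through simple_transfer to break ties between equally loaded candidates. A usage sketch (Ring0 and the node name are placeholders):

    Seed = rand:seed(exrop, {1632, 989499, 279637}),
    ExitRing =
        riak_core_membership_leave:remove_from_cluster(Ring0, 'n4@host1', Seed).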
+ diff --git a/src/riak_core_new_claim.erl b/src/riak_core_new_claim.erl deleted file mode 100644 index c77a0e299..000000000 --- a/src/riak_core_new_claim.erl +++ /dev/null @@ -1,39 +0,0 @@ -%% ------------------------------------------------------------------- -%% -%% riak_core: Core Riak Application -%% -%% Copyright (c) 2007-2011 Basho Technologies, Inc. All Rights Reserved. -%% -%% This file is provided to you under the Apache License, -%% Version 2.0 (the "License"); you may not use this file -%% except in compliance with the License. You may obtain -%% a copy of the License at -%% -%% http://www.apache.org/licenses/LICENSE-2.0 -%% -%% Unless required by applicable law or agreed to in writing, -%% software distributed under the License is distributed on an -%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -%% KIND, either express or implied. See the License for the -%% specific language governing permissions and limitations -%% under the License. -%% -%% ------------------------------------------------------------------- -%% -%% @doc This module is a pass-thru to `riak_core_claim' for backwards -%% compatability. - --module(riak_core_new_claim). --export([new_wants_claim/2, new_choose_claim/2]). - -%% @deprecated -%% -%% @doc This exists for the sole purpose of backwards compatability. -new_wants_claim(Ring, Node) -> - riak_core_claim:wants_claim_v2(Ring, Node). - -%% @deprecated -%% -%% @doc This exists for the sole purpose of backwards compatability. -new_choose_claim(Ring, Node) -> - riak_core_claim:choose_claim_v2(Ring, Node). diff --git a/test/claim_simulation.erl b/test/claim_simulation.erl index 442b0a833..2bceb92ba 100644 --- a/test/claim_simulation.erl +++ b/test/claim_simulation.erl @@ -33,8 +33,7 @@ -define(get(K, PL, D), proplists:get_value(K, PL, D)). basic_test_() -> - {timeout, 60000, [fun basic_default/0, - fun basic_new/0]}. + {timeout, 60000, [fun basic_default/0]}. basic_default() -> Opts = [{suffix, "_default"}, @@ -47,17 +46,6 @@ basic_default() -> ], run(Opts). -basic_new() -> - Opts = [{suffix, "_new"}, - {wc_mf, {riak_core_new_claim, new_wants_claim}}, - {cc_mf, {riak_core_new_claim, new_choose_claim}}, - {target_n_val, 4}, - {ring_size, 32}, - {node_count, 8}, - {node_capacity, 24} - ], - run(Opts). - run(Opts) -> application:load(riak_core), @@ -96,7 +84,8 @@ run(Opts) -> {Sum, Curr} end, Owners1, Owners), - MeetTargetN = [riak_core_claim:meets_target_n(R, TargetN) || R <- Rings], + MeetTargetN = + [riak_core_membership_claim:meets_target_n(R, TargetN) || R <- Rings], FName = io_lib:format("/tmp/rings_~w_~w~s.txt", [RingSize, NodeCount, Suffix]), diff --git a/test/rack_awareness_test.erl b/test/rack_awareness_test.erl index 0fadbc880..83f374f30 100644 --- a/test/rack_awareness_test.erl +++ b/test/rack_awareness_test.erl @@ -207,9 +207,9 @@ do_generate_ring(Size, ContributorNodes) -> claim(NewRing). claim(Ring) -> - WantsClaimFun = {riak_core_claim, default_wants_claim}, - ChooseClaimFun = {riak_core_claim, default_choose_claim}, - riak_core_claim:claim(Ring, WantsClaimFun, ChooseClaimFun). + WantsClaimFun = {riak_core_membership_claim, default_wants_claim}, + ChooseClaimFun = {riak_core_membership_claim, default_choose_claim}, + riak_core_membership_claim:claim(Ring, WantsClaimFun, ChooseClaimFun). 
generate_site_names(Count) ->
   lists:map(fun(Name) -> binary_to_list(Name) end,
diff --git a/test/riak_core_claim_statem.erl b/test/riak_core_claim_statem.erl
index fd3b9f3c6..2884bc828 100644
--- a/test/riak_core_claim_statem.erl
+++ b/test/riak_core_claim_statem.erl
@@ -157,7 +157,11 @@ claim_args(_From, _To, #state{ring=Ring}) ->
 
 %% @doc claim - The actual operation
 claim(Ring) ->
-    R =riak_core_claim:claim(Ring, {riak_core_claim, wants_claim_v2}, {riak_core_claim, choose_claim_v2}),
+    R =
+        riak_core_membership_claim:claim(
+            Ring,
+            {riak_core_membership_claim, wants_claim_v2},
+            {riak_core_membership_claim, choose_claim_v2}),
     R.
 
 %% @doc claim_next - Next state function

From e1ce7842b9cb39b09e04d30e8e5ebb2ec782d4b7 Mon Sep 17 00:00:00 2001
From: Martin Sumner
Date: Mon, 27 Mar 2023 15:24:28 +0100
Subject: [PATCH 02/30] Remove deprecated v1 claim/wants

---
 src/riak_core_claim_sim.erl        |  17 +-
 src/riak_core_membership_claim.erl | 239 ++++------------------------
 2 files changed, 36 insertions(+), 220 deletions(-)

diff --git a/src/riak_core_claim_sim.erl b/src/riak_core_claim_sim.erl
index b6c8a9a7e..d003e18fe 100644
--- a/src/riak_core_claim_sim.erl
+++ b/src/riak_core_claim_sim.erl
@@ -517,9 +517,7 @@ commission_tests_rest() ->
     ].
 
 commission_claims() ->
-    [{{riak_core_membership_claim, wants_claim_v1},
-      {riak_core_membership_claim, choose_claim_v1}},
-     {{riak_core_membership_claim, wants_claim_v2},
+    [{{riak_core_membership_claim, wants_claim_v2},
       {riak_core_membership_claim, choose_claim_v2}},
      {{riak_core_membership_claim, wants_claim_v3},
       {riak_core_membership_claim, choose_claim_v3}}].
 
 
 %% -------------------------------------------------------------------
@@ -545,18 +543,7 @@ run_test() ->
                  {cmds, [[{join,a}],[{join,b}]]},
                  {print,false},
                  {return_ring, true}]),
-    ?assert(is_tuple(Ring2)),
-    {ok, Fh} = file:open("sim.out", [write]),
-    ?assertEqual(ok, run([{ring, Ring2},
-                          {target_n_val,4},
-                          {wants,{riak_core_membership_claim,wants_claim_v1}},
-                          {choose,{riak_core_membership_claim,choose_claim_v1}},
-                          {cmds, [[{join,3}]]},
-                          {analysis, [{failures, 2},{n_val, 3}]},
-                          {print,Fh},
-                          {return_ring, false}])),
-    file:close(Fh),
-    file:delete("sim.out").
+    ?assert(is_tuple(Ring2)).
 
 
 %% Decided not to run by default, perhaps better as an
diff --git a/src/riak_core_membership_claim.erl b/src/riak_core_membership_claim.erl
index 1b8095038..ef9a1337b 100644
--- a/src/riak_core_membership_claim.erl
+++ b/src/riak_core_membership_claim.erl
@@ -57,10 +57,8 @@
          default_choose_claim/1, default_choose_claim/2, default_choose_claim/3,
          never_wants_claim/1, never_wants_claim/2,
          random_choose_claim/1, random_choose_claim/2, random_choose_claim/3]).
--export([wants_claim_v1/1, wants_claim_v1/2,
-         wants_claim_v2/1, wants_claim_v2/2,
+-export([wants_claim_v2/1, wants_claim_v2/2,
          wants_claim_v3/1, wants_claim_v3/2,
-         choose_claim_v1/1, choose_claim_v1/2, choose_claim_v1/3,
          choose_claim_v2/1, choose_claim_v2/2, choose_claim_v2/3,
          choose_claim_v3/1, choose_claim_v3/2, choose_claim_v3/3,
          claim_rebalance_n/2, claim_diversify/3, claim_diagonal/3,
@@ -69,12 +67,20 @@
 
 -ifdef(TEST).
 -compile(export_all).
+
 -ifdef(EQC).
--export([prop_claim_ensures_unique_nodes/1, prop_wants/0, prop_wants_counts/0,eqc_check/2,
-         prop_claim_ensures_unique_nodes_v2/0, % prop_claim_ensures_unique_nodes_v3/0,
-         prop_take_idxs/0]).
+-export(
+    [
+        prop_claim_ensures_unique_nodes/1,
+        prop_wants/0, prop_wants_counts/0,
+        eqc_check/2,
+        prop_claim_ensures_unique_nodes_v2/0,
+        % prop_claim_ensures_unique_nodes_v3/0,
+        prop_take_idxs/0
+    ]).
 -include_lib("eqc/include/eqc.hrl").
 -endif.
+
 -include_lib("eunit/include/eunit.hrl").
 -endif.
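With the v1 functions gone, any remaining caller has to move to the v2 entry points, which default_wants_claim and default_choose_claim now delegate to unconditionally (see the hunks that follow). An illustrative migration:

    %% before (now fails with undef):
    %%   riak_core_membership_claim:choose_claim_v1(Ring, Node)
    %% after:
    NewRing = riak_core_membership_claim:choose_claim_v2(Ring, Node).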
@@ -124,20 +130,10 @@ default_choose_claim(Ring) -> default_choose_claim(Ring, node()). default_choose_claim(Ring, Node) -> - case riak_core_ring:legacy_ring(Ring) of - true -> - choose_claim_v1(Ring, Node); - false -> - choose_claim_v2(Ring, Node) - end. + choose_claim_v2(Ring, Node). default_choose_claim(Ring, Node, Params) -> - case riak_core_ring:legacy_ring(Ring) of - true -> - choose_claim_v1(Ring, Node, Params); - false -> - choose_claim_v2(Ring, Node, Params) - end. + choose_claim_v2(Ring, Node, Params). %% @spec default_wants_claim(riak_core_ring()) -> {yes, integer()} | no %% @doc Want a partition if we currently have less than floor(ringsize/nodes). @@ -145,35 +141,7 @@ default_wants_claim(Ring) -> default_wants_claim(Ring, node()). default_wants_claim(Ring, Node) -> - case riak_core_ring:legacy_ring(Ring) of - true -> - wants_claim_v1(Ring, Node); - false -> - wants_claim_v2(Ring, Node) - end. - -%% @deprecated -wants_claim_v1(Ring) -> - wants_claim_v1(Ring, node()). - -%% @deprecated -wants_claim_v1(Ring0, Node) -> - Ring = riak_core_ring:upgrade(Ring0), - %% Calculate the expected # of partitions for a perfectly balanced ring. Use - %% this expectation to determine the relative balance of the ring. If the - %% ring isn't within +-2 partitions on all nodes, we need to rebalance. - ExpParts = get_expected_partitions(Ring, Node), - PCounts = lists:foldl(fun({_Index, ANode}, Acc) -> - orddict:update_counter(ANode, 1, Acc) - end, [{Node, 0}], riak_core_ring:all_owners(Ring)), - RelativeCounts = [I - ExpParts || {_ANode, I} <- PCounts], - WantsClaim = (lists:min(RelativeCounts) < -2) or (lists:max(RelativeCounts) > 2), - case WantsClaim of - true -> - {yes, 0}; - false -> - no - end. + wants_claim_v2(Ring, Node). wants_claim_v2(Ring) -> wants_claim_v2(Ring, node()). @@ -264,30 +232,6 @@ default_choose_params(Params) -> Params end. -%% @deprecated -choose_claim_v1(Ring) -> - choose_claim_v1(Ring, node()). - -%% @deprecated -choose_claim_v1(Ring0, Node) -> - choose_claim_v1(Ring0, Node, []). - -choose_claim_v1(Ring0, Node, Params0) -> - Params = default_choose_params(Params0), - Ring = riak_core_ring:upgrade(Ring0), - TargetN = proplists:get_value(target_n_val, Params), - case meets_target_n(Ring, TargetN) of - {true, TailViolations} -> - %% if target N is met, then it doesn't matter where - %% we claim vnodes, as long as we don't violate the - %% target N with any of our additions - %% (== claim partitions at least N steps apart) - claim_with_n_met(Ring, TailViolations, Node); - false -> - %% we don't meet target N yet, rebalance - claim_rebalance_n(Ring, Node) - end. - choose_claim_v2(Ring) -> choose_claim_v2(Ring, node()). @@ -455,10 +399,11 @@ increase_takes([], N, _Max, Acc) when N < 0 -> [{Node, Delta} || {Node, _Own, Delta} <- lists:usort(Acc)]; increase_takes([{Node, Own, Delta} | Rest], N, Max, Acc) when Delta > 0 -> WouldOwn = Own + Delta, - Additive = case WouldOwn +1 =< Max of - true -> 1; - false -> 0 - end, + Additive = + case (WouldOwn + 1) =< Max of + true -> 1; + false -> 0 + end, increase_takes(Rest, N+Additive, Max, [{Node, Own, Delta+Additive} | Acc]); increase_takes([NodeDelta | Rest], N, Max, Acc) -> increase_takes(Rest, N, Max, [NodeDelta | Acc]). @@ -466,13 +411,15 @@ increase_takes([NodeDelta | Rest], N, Max, Acc) -> meets_target_n(Ring, TargetN) -> Owners = lists:keysort(1, riak_core_ring:all_owners(Ring)), meets_target_n(Owners, TargetN, 0, [], []). 
+ meets_target_n([{Part,Node}|Rest], TargetN, Index, First, Last) -> case lists:keytake(Node, 1, Last) of {value, {Node, LastIndex, _}, NewLast} -> if Index-LastIndex >= TargetN -> %% node repeat respects TargetN - meets_target_n(Rest, TargetN, Index+1, First, - [{Node, Index, Part}|NewLast]); + meets_target_n( + Rest, TargetN, Index + 1, First, + [{Node, Index, Part}|NewLast]); true -> %% violation of TargetN false @@ -487,11 +434,16 @@ meets_target_n([], TargetN, Index, First, Last) -> %% compute violations at wrap around, but don't fail %% because of them: handle during reclaim Violations = - lists:filter(fun({Node, L, _}) -> - {Node, F} = proplists:lookup(Node, First), - (Index-L)+F < TargetN - end, - Last), + lists:filter( + fun({Node, L, _}) -> + {Node, F} = proplists:lookup(Node, First), + if ((Index - L) + F) < TargetN -> + true; + true -> + false + end + end, + Last), {true, [ Part || {_, _, Part} <- Violations ]}. @@ -766,91 +718,7 @@ never_wants_claim(_,_) -> no. %% Private %% =================================================================== -%% @private -claim_hole(Ring, Mine, Owners, Node) -> - Choices = case find_biggest_hole(Mine) of - {I0, I1} when I0 < I1 -> - %% start-middle of the ring - lists:takewhile( - fun({I, _}) -> I /= I1 end, - tl(lists:dropwhile( - fun({I, _}) -> I /= I0 end, - Owners))); - {I0, I1} when I0 > I1 -> - %% wrap-around end-start of the ring - tl(lists:dropwhile( - fun({I, _}) -> I /= I0 end, Owners)) - ++lists:takewhile( - fun({I, _}) -> I /= I1 end, Owners); - {I0, I0} -> - %% node only has one claim - {Start, End} = - lists:splitwith( - fun({I, _}) -> I /= I0 end, - Owners), - tl(End)++Start - end, - Half = length(Choices) div 2, - {I, _} = lists:nth(Half, Choices), - riak_core_ring:transfer_node(I, Node, Ring). -%% @private -claim_with_n_met(Ring, TailViolations, Node) -> - CurrentOwners = lists:keysort(1, riak_core_ring:all_owners(Ring)), - Nodes = lists:usort([Node|riak_core_ring:claiming_members(Ring)]), - case lists:sort([ I || {I, N} <- CurrentOwners, N == Node ]) of - [] -> - %% node hasn't claimed anything yet - just claim stuff - Spacing = length(Nodes), - [{First,_}|OwnList] = - case TailViolations of - [] -> - %% no wrap-around problems - choose whatever - lists:nthtail(Spacing-1, CurrentOwners); - [TV|_] -> - %% attempt to cure a wrap-around problem - lists:dropwhile( - fun({I, _}) -> I /= TV end, - lists:reverse(CurrentOwners)) - end, - {_, NewRing} = lists:foldl( - fun({I, _}, {0, Acc}) -> - {Spacing, riak_core_ring:transfer_node(I, Node, Acc)}; - (_, {S, Acc}) -> - {S-1, Acc} - end, - {Spacing, riak_core_ring:transfer_node(First, Node, Ring)}, - OwnList), - NewRing; - Mine -> - %% node already has claims - respect them - %% pick biggest hole & sit in the middle - %% rebalance will cure any mistake on the next pass - claim_hole(Ring, Mine, CurrentOwners, Node) - end. - -%% @private -find_biggest_hole(Mine) -> - lists:foldl(fun({I0, I1}, none) -> - {I0, I1}; - ({I0, I1}, {C0, C1}) when I0 < I1-> - %% start-middle of the ring - if I1-I0 > C1-C0 -> - {I0, I1}; - true -> - {C0, C1} - end; - ({I0, I1}, {C0, C1}) -> - %% wrap-around end-start of the ring - Span = I1+trunc(math:pow(2, 160))-1-I0, - if Span > C1-C0 -> - {I0, I1}; - true -> - {C0, C1} - end - end, - none, - lists:zip(Mine, tl(Mine)++[hd(Mine)])). %% @private %% @@ -896,22 +764,6 @@ add_default_deltas(IdxOwners, Deltas, Default) -> Defaults = [{Member, Default} || Member <- Owners2], lists:ukeysort(1, Deltas ++ Defaults). 
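A worked example of the spacing rule that the meets_target_n clauses above enforce. The ownership lists are illustrative; meets_target_n/2 itself takes a ring and derives this list via riak_core_ring:all_owners/1:

    %% With TargetN = 2, repeats of a node must be at least 2 positions apart;
    %% wrap-around repeats are reported back rather than failing the check:
    %%   [{0,n1},{1,n2},{2,n1},{3,n2}]  -> {true, []}
    %%   [{0,n1},{1,n1},{2,n2},{3,n2}]  -> false       (n1 repeats at adjacent indices)
    %%   [{0,n1},{1,n2},{2,n3},{3,n1}]  -> {true, [3]} (n1 wraps from 3 to 0 within TargetN)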
-%% @private
-get_expected_partitions(Ring, Node) ->
-    riak_core_ring:num_partitions(Ring) div get_member_count(Ring, Node).
-
-%% @private
-get_member_count(Ring, Node) ->
-    %% Determine how many nodes are involved with the ring; if the requested
-    %% node is not yet part of the ring, include it in the count.
-    AllMembers = riak_core_ring:claiming_members(Ring),
-    case lists:member(Node, AllMembers) of
-        true ->
-            length(AllMembers);
-        false ->
-            length(AllMembers) + 1
-    end.
-
 %% @private
 %%
 %% @doc Filter out candidate indices that would violate target_n given
@@ -1312,29 +1164,6 @@ wants_claim_test() ->
     riak_core_ring_manager:cleanup_ets(test),
     riak_core_ring_manager:stop().
 
-find_biggest_hole_test() ->
-    Max = trunc(math:pow(2, 160)),
-    Part16 = Max/16,
-
-    %% single partition claimed
-    ?assertEqual({Part16*5, Part16*5},
-                 find_biggest_hole([Part16*5])),
-
-    %% simple hole is in the middle
-    ?assertEqual({Part16*3, Part16*13},
-                 find_biggest_hole([Part16*3, Part16*13])),
-    %% complex hole in the middle
-    ?assertEqual({Part16*5, Part16*10},
-                 find_biggest_hole([Part16*3, Part16*5,
-                                    Part16*10, Part16*15])),
-
-    %% simple hole is around the end
-    ?assertEqual({Part16*10, Part16*8},
-                 find_biggest_hole([Part16*8, Part16*10])),
-    %% complex hole is around the end
-    ?assertEqual({Part16*13, Part16*3},
-                 find_biggest_hole([Part16*3, Part16*7,
-                                    Part16*10, Part16*13])).
 
 %% @private console helper function to return node lists for claiming
 %% partitions

From 28fbcf90af7d1bbedb84aa99dffdf1181b58f54a Mon Sep 17 00:00:00 2001
From: Martin Sumner
Date: Mon, 27 Mar 2023 16:12:21 +0100
Subject: [PATCH 03/30] Update framework for Claim

Stops fake wants being required to prompt claim on a location change.

Allow for a claim module to implement a
sort_members_for_choose(Ring, Members, Owners) -> SortedMembers function, to
pre-sort the members being passed into claim_rebalance.

Add further specs.
---
 src/riak_core_membership_claim.erl | 162 ++++++++++++++++++++++-------
 1 file changed, 124 insertions(+), 38 deletions(-)

diff --git a/src/riak_core_membership_claim.erl b/src/riak_core_membership_claim.erl
index ef9a1337b..7d45a8760 100644
--- a/src/riak_core_membership_claim.erl
+++ b/src/riak_core_membership_claim.erl
@@ -86,20 +86,103 @@
 
 -define(DEF_TARGET_N, 4).
 
+-type choose_function() ::
+    {module(), atom()}|{module(), atom(), list(tuple())}.
 -type delta() :: {node(), Ownership::non_neg_integer(), Delta::integer()}.
 -type deltas() :: [delta()].
 
+%% ===================================================================
+%% Claim API and supporting functions
+%% ===================================================================
+
+-spec claim(
+    riak_core_ring:riak_core_ring()) -> riak_core_ring:riak_core_ring().
 claim(Ring) ->
     Want = app_helper:get_env(riak_core, wants_claim_fun),
-    Choose = app_helper:get_env(riak_core, choose_claim_fun),
+    Choose =
+        case app_helper:get_env(riak_core, choose_claim_fun) of
+            choose_claim_v2 ->
+                {riak_core_membership_claim, choose_claim_v2};
+            choose_claim_v3 ->
+                {riak_core_membership_claim, choose_claim_v3};
+            choose_claim_v4 ->
+                {riak_core_membership_claim, choose_claim_v4};
+            {CMod, CFun} ->
+                {CMod, CFun}
+        end,
     claim(Ring, Want, Choose).
 
+%% @doc claim/3 is used in tests as it allows for {Mod, Fun, Params} to be
+%% passed in as the choose function, to override selection of defaults from
+%% application environment for target n_vals.
+-spec claim( + riak_core_ring:riak_core_ring(), + {module(), atom()}, + choose_function()) -> riak_core_ring:riak_core_ring(). +claim(Ring, {WMod, WFun}=Want, Choose) -> Members = riak_core_ring:claiming_members(Ring), - lists:foldl(fun(Node, Ring0) -> - claim_until_balanced(Ring0, Node, Want, Choose) - end, Ring, Members). + Owners = + lists:usort( + lists:map( + fun({_Idx, N}) -> N end, + riak_core_ring:all_owners(Ring))), + NoInitialWants = + lists:all( + fun(N) -> apply(WMod, WFun, [Ring, N]) == no end, Members), + SortedMembers = sort_members_for_choose(Ring, Members, Owners, Choose), + case NoInitialWants of + true -> + case riak_core_ring:has_location_changed(Ring) of + true -> + [HeadMember|_Rest] = SortedMembers, + choose_new_ring( + riak_core_ring:clear_location_changed(Ring), + HeadMember, + Choose); + false -> + Ring + end; + false -> + lists:foldl( + fun(Node, Ring0) -> + claim_until_balanced(Ring0, Node, Want, Choose) + end, + riak_core_ring:clear_location_changed(Ring), + SortedMembers) + end. + +-spec choose_new_ring( + riak_core_ring:riak_core_ring(),node(), choose_function()) -> + riak_core_ring:riak_core_ring(). +choose_new_ring(Ring, Node, Choose) -> + case Choose of + {CMod, CFun} -> + CMod:CFun(Ring, Node); + {CMod, CFun, Params} -> + CMod:CFun(Ring, Node, Params) + end. +%% @doc +%% The order by which members are passed in to claim may make a difference +%% to the outcome, so prepare to allow for this order to be changeable in +%% different claim versions +-spec sort_members_for_choose( + riak_core_ring:riak_core_ring(), + list(node()), + list(node()), + choose_function()) -> list(node()). +sort_members_for_choose(Ring, Members, Owners, Choose) -> + CMod = element(1, Choose), + case erlang:function_exported(CMod, sort_members_for_choose, 3) of + true -> + CMod:sort_members_for_choose(Ring, Members, Owners); + false -> + Members + end. + +-spec claim_until_balanced( + riak_core_ring:riak_core_ring(), node()) -> + riak_core_ring:riak_core_ring(). claim_until_balanced(Ring, Node) -> Want = app_helper:get_env(riak_core, wants_claim_fun), Choose = app_helper:get_env(riak_core, choose_claim_fun), @@ -111,21 +194,19 @@ claim_until_balanced(Ring, Node, {WMod, WFun}=Want, Choose) -> no -> Ring; {yes, _NumToClaim} -> - NewRing = case Choose of - {CMod, CFun} -> - CMod:CFun(Ring, Node); - {CMod, CFun, Params} -> - CMod:CFun(Ring, Node, Params) - end, + NewRing = + case Choose of + {CMod, CFun} -> + CMod:CFun(Ring, Node); + {CMod, CFun, Params} -> + CMod:CFun(Ring, Node, Params) + end, claim_until_balanced(NewRing, Node, Want, Choose) end. -%% =================================================================== -%% Claim Function Implementations -%% =================================================================== - -%% @spec default_choose_claim(riak_core_ring()) -> riak_core_ring() %% @doc Choose a partition at random. +-spec default_choose_claim( + riak_core_ring:riak_core_ring()) -> riak_core_ring:riak_core_ring(). default_choose_claim(Ring) -> default_choose_claim(Ring, node()). @@ -135,17 +216,41 @@ default_choose_claim(Ring, Node) -> default_choose_claim(Ring, Node, Params) -> choose_claim_v2(Ring, Node, Params). -%% @spec default_wants_claim(riak_core_ring()) -> {yes, integer()} | no %% @doc Want a partition if we currently have less than floor(ringsize/nodes). +-spec default_wants_claim( + riak_core_ring:riak_core_ring()) -> {yes, integer()} | no. default_wants_claim(Ring) -> default_wants_claim(Ring, node()). 
default_wants_claim(Ring, Node) -> wants_claim_v2(Ring, Node). +%% Provide default choose parameters if none given +default_choose_params() -> + default_choose_params([]). + +default_choose_params(Params) -> + case proplists:get_value(target_n_val, Params) of + undefined -> + TN = app_helper:get_env(riak_core, target_n_val, ?DEF_TARGET_N), + [{target_n_val, TN} | Params]; + _-> + Params + end. + +%% =================================================================== +%% Claim Function Implementations +%% =================================================================== + +-spec wants_claim_v2( + riak_core_ring:riak_core_ring()) -> + no|{yes, location_change|non_neg_integer()}. wants_claim_v2(Ring) -> wants_claim_v2(Ring, node()). +-spec wants_claim_v2( + riak_core_ring:riak_core_ring(), node()) -> + no|{yes, non_neg_integer()}. wants_claim_v2(Ring, Node) -> Active = riak_core_ring:claiming_members(Ring), Owners = riak_core_ring:all_owners(Ring), @@ -156,12 +261,7 @@ wants_claim_v2(Ring, Node) -> Count = proplists:get_value(Node, Counts, 0), case Count < Avg of false -> - case riak_core_ring:has_location_changed(Ring) of - true -> - {yes, 1}; - false -> - no - end; + no; true -> {yes, Avg - Count} end. @@ -219,19 +319,6 @@ wants_claim_v3(Ring, _Node) -> end end. -%% Provide default choose parameters if none given -default_choose_params() -> - default_choose_params([]). - -default_choose_params(Params) -> - case proplists:get_value(target_n_val, Params) of - undefined -> - TN = app_helper:get_env(riak_core, target_n_val, ?DEF_TARGET_N), - [{target_n_val, TN} | Params]; - _-> - Params - end. - choose_claim_v2(Ring) -> choose_claim_v2(Ring, node()). @@ -239,8 +326,7 @@ choose_claim_v2(Ring, Node) -> Params = default_choose_params(), choose_claim_v2(Ring, Node, Params). -choose_claim_v2(RingOrig, Node, Params0) -> - Ring = riak_core_ring:clear_location_changed(RingOrig), +choose_claim_v2(Ring, Node, Params0) -> Params = default_choose_params(Params0), %% Active::[node()] Active = riak_core_ring:claiming_members(Ring), From eb13a48dc5868ada7c58f5a7a492317d088ab921 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Mon, 27 Mar 2023 19:18:31 +0100 Subject: [PATCH 04/30] Add choose_claim_v4 --- src/riak_core_claim_location.erl | 1050 ++++++++++++++++++++++++++++ src/riak_core_location.erl | 66 +- src/riak_core_membership_claim.erl | 73 +- 3 files changed, 1146 insertions(+), 43 deletions(-) create mode 100644 src/riak_core_claim_location.erl diff --git a/src/riak_core_claim_location.erl b/src/riak_core_claim_location.erl new file mode 100644 index 000000000..ce4bf1848 --- /dev/null +++ b/src/riak_core_claim_location.erl @@ -0,0 +1,1050 @@ +%% ------------------------------------------------------------------- +%% +%% riak_core: Core Riak Application +%% +%% This file is provided to you under the Apache License, +%% Version 2.0 (the "License"); you may not use this file +%% except in compliance with the License. You may obtain +%% a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, +%% software distributed under the License is distributed on an +%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +%% KIND, either express or implied. See the License for the +%% specific language governing permissions and limitations +%% under the License. 
+%% +%% ------------------------------------------------------------------- + +%% @doc choose and sequential claim functions for a more location friendly +%% claim algorithm + +-module(riak_core_claim_location). + +-export( + [ + choose_claim_v4/2, choose_claim_v4/3, + sequential_claim/2, sequential_claim/3, + sort_members_for_choose/3 + ]). + +sort_members_for_choose(Ring, Members, Owners) -> + NodesLocations = riak_core_ring:get_nodes_locations(Ring), + case riak_core_location:has_location_set_in_cluster(NodesLocations) of + false -> + Members; + true -> + LocationNodesD = + riak_core_location:get_location_nodes(Members, NodesLocations), + InitSort = initial_location_sort(dict:to_list(LocationNodesD)), + lists:append(lists:subtract(InitSort, Owners), Owners) + end. + +initial_location_sort(LocationNodeList) -> + NodeLists = + sort_lists_by_length( + lists:map(fun({_L, NL}) -> NL end, LocationNodeList)), + roll_nodelists(NodeLists, []). + +roll_nodelists(NodeLists, ListOfNodes) -> + case length(hd(NodeLists)) of + L when L > 1 -> + {UpdNodeLists, UpdListOfNodes} = + lists:mapfoldl( + fun(NL, Acc) -> + case length(NL) of + L when L > 1 -> + [H|T] = NL, + {T, [H|Acc]}; + _ -> + {NL, Acc} + end + end, + ListOfNodes, + NodeLists), + roll_nodelists(UpdNodeLists, UpdListOfNodes); + 1 -> + ListOfNodes ++ lists:flatten(NodeLists) + end. + +choose_claim_v4(Ring, Node) -> + Params = riak_core_membership_claim:default_choose_params(), + choose_claim_v4(Ring, Node, Params). + +choose_claim_v4(Ring, Node, Params0) -> + Params = riak_core_membership_claim:default_choose_params(Params0), + Active = riak_core_ring:claiming_members(Ring), + Owners = riak_core_ring:all_owners(Ring), + Ownerships = riak_core_membership_claim:get_counts(Active, Owners), + RingSize = riak_core_ring:num_partitions(Ring), + NodeCount = erlang:length(Active), + {MinVnodes, MaxVnodes, Deltas} + = assess_deltas(RingSize, NodeCount, Ownerships), + {Node, CurrentOwnerships} = + lists:keyfind(Node, 1, Ownerships), + Want = MaxVnodes - CurrentOwnerships, + TargetN = proplists:get_value(target_n_val, Params), + + NodesToClaim = lists:filter(fun({_N, O}) -> O == 0 end, Ownerships), + NodesAllClaimed = + case NodesToClaim of + [{Node, _}] -> + true; + [] -> + true; + _ -> + false + end, + + ZippedIndices = + lists:zip( + lists:seq(0, length(Owners) - 1), + [Idx || {Idx, _} <- Owners] + ), + AllIndices = + case NodesAllClaimed of + true -> + ZippedIndices; + false -> + StripeCount = max(1, (length(Active) - 1)), + StripeList = + lists:map( + fun({Nth, I}) -> {Nth rem StripeCount, Nth, I} end, + ZippedIndices), + Counter = + dict:from_list( + lists:map( + fun(I) -> {I, 0} end, + lists:seq(0, StripeCount - 1)) + ), + Counted = + lists:foldl( + fun({R, _Nth, _I}, C) -> + dict:update_counter(R, 1, C) + end, + Counter, + StripeList), + lists:map( + fun({_OD, _RC, _R, Nth, I}) -> {Nth, I} end, + lists:sort( + lists:map( + fun({R, Nth, I}) -> + {I, Owner} = lists:keyfind(I, 1, Owners), + {Owner, Delta} = lists:keyfind(Owner, 1, Deltas), + {Delta, dict:fetch(R, Counted), R, Nth, I} + end, + lists:reverse(StripeList) + ) + ) + ) + end, + + EnoughNodes = + (NodeCount > TargetN) + or ((NodeCount == TargetN) and (RingSize rem TargetN =:= 0)), + + case EnoughNodes of + true -> + %% If we have enough nodes to meet target_n, then we prefer to + %% claim indices that are currently causing violations, and then + %% fallback to indices in linear order. The filtering steps below + %% will ensure no new violations are introduced. 
+ NodeViolations = find_node_violations(Ring, TargetN), + LocationViolations = + lists:subtract( + find_location_violations(Ring, TargetN), NodeViolations), + {DirtyNodeIndices, OtherIndices} = + lists:splitwith( + fun({_Nth, Idx}) -> + lists:member(Idx, NodeViolations) + end, + AllIndices), + {DirtyLocationIndices, CleanIndices} = + lists:splitwith( + fun({_Nth, Idx}) -> + lists:member(Idx, LocationViolations) + end, + OtherIndices + ), + Indices = DirtyNodeIndices ++ DirtyLocationIndices ++ CleanIndices; + false -> + %% If we do not have enough nodes to meet target_n, then we prefer + %% claiming the same indices that would occur during a + %% re-diagonalization of the ring with target_n nodes, falling + %% back to linear offsets off these preferred indices when the + %% number of indices desired is less than the computed set. + Padding = lists:duplicate(TargetN, undefined), + Expanded = lists:sublist(Active ++ Padding, TargetN), + ExpandedLocation = get_nodes_by_location(Expanded, Ring), + PreferredClaim = + riak_core_membership_claim:diagonal_stripe( + Ring, ExpandedLocation), + PreferredNth = [begin + {Nth, Idx} = lists:keyfind(Idx, 2, AllIndices), + Nth + end || {Idx,Owner} <- PreferredClaim, + Owner =:= Node], + Offsets = lists:seq(0, RingSize div length(PreferredNth)), + AllNth = lists:sublist([(X+Y) rem RingSize || Y <- Offsets, + X <- PreferredNth], + RingSize), + Indices = [lists:keyfind(Nth, 1, AllIndices) || Nth <- AllNth] + end, + + %% Filter out indices that conflict with the node's existing ownership + ClaimableIdxs = + prefilter_violations( + Ring, Node, AllIndices, Indices, TargetN, RingSize), + + %% Claim indices from the remaining candidate set + Claim2 = + case select_indices( + Owners, Deltas, ClaimableIdxs, TargetN, RingSize) of + [] -> + []; + Claim -> + lists:sublist(Claim, Want) + end, + NewRing = + lists:foldl( + fun(Idx, Ring0) -> + riak_core_ring:transfer_node(Idx, Node, Ring0) + end, + Ring, + Claim2), + + BadRing = + riak_core_membership_claim:meets_target_n(NewRing, TargetN) == false, + DeficientClaim = (length(Claim2) + CurrentOwnerships) < MinVnodes, + BadClaim = EnoughNodes and BadRing and NodesAllClaimed, + + case BadClaim or DeficientClaim of + true -> + %% Unable to claim, fallback to re-diagonalization + sequential_claim(Ring, Node, TargetN); + _ -> + NewRing + end. + + +%% @doc +%% Assess what the mnimum and maximum number of vnodes which should be owned by +%% each node, and return a list of nodes with the Deltas from the minimum i.e. +%% where a node has more vnodes than the minimum the delta will be a negative +%% number indicating the number of vnodes it can offer to a node with wants. +-spec assess_deltas( + pos_integer(), pos_integer(), [{node(), non_neg_integer()}]) -> + {non_neg_integer(), pos_integer(), [{node(), integer()}]}. +assess_deltas(RingSize, NodeCount, Ownerships) -> + MinVnodes = RingSize div NodeCount, + MaxVnodes = + case RingSize rem NodeCount of + 0 -> + MinVnodes; + _ -> + MinVnodes + 1 + end, + Deltas = + lists:map(fun({N, VNs}) -> {N, MinVnodes - VNs} end, Ownerships), + {MinVnodes, MaxVnodes, Deltas}. + + +%% @private +%% +%% @doc Filter out candidate indices that would violate target_n given +%% a node's current partition ownership. 
Only interested in indices which +%% are not currently owned within a location +-spec prefilter_violations( + riak_core_ring:riak_core_ring(), + node(), + list({non_neg_integer(), non_neg_integer()}), + list({non_neg_integer(), non_neg_integer()}), + pos_integer(), + pos_integer()) -> list({non_neg_integer(), non_neg_integer()}). +prefilter_violations(Ring, Node, AllIndices, Indices, TargetN, RingSize) -> + CurrentIndices = + indices_nth_subset(AllIndices, riak_core_ring:indices(Ring, Node)), + case riak_core_location:support_locations_claim(Ring, TargetN) of + true -> + OtherLocalNodes = + riak_core_location:local_nodes(Ring, Node), + LocalIndices = + indices_nth_subset( + AllIndices, + lists:flatten( + lists:map( + fun(N) -> riak_core_ring:indices(Ring, N) end, + [Node|OtherLocalNodes]))), + SafeRemoteIndices = + safe_indices( + lists:subtract(Indices, LocalIndices), + LocalIndices, TargetN, RingSize), + SafeLocalIndices = + safe_indices( + lists:subtract( + lists:filter( + fun(NthIdx) -> lists:member(NthIdx, Indices) end, + LocalIndices), + CurrentIndices), + CurrentIndices, TargetN, RingSize), + SafeRemoteIndices ++ SafeLocalIndices; + false -> + safe_indices( + lists:subtract(AllIndices, CurrentIndices), + CurrentIndices, TargetN, RingSize) + end. + +-spec indices_nth_subset( + list({non_neg_integer(), non_neg_integer()}), + list(non_neg_integer())) -> + list({non_neg_integer(), non_neg_integer()}). +indices_nth_subset(IndicesNth, Indices) -> + lists:filter(fun({_N, Idx}) -> lists:member(Idx, Indices) end, IndicesNth). + +-spec safe_indices( + list({non_neg_integer(), non_neg_integer()}), + list({non_neg_integer(), non_neg_integer()}), + pos_integer(), + pos_integer()) -> + list({non_neg_integer(), non_neg_integer()}). +safe_indices( + IndicesToCheck, LocalIndicesToAvoid, TargetN, RingSize) -> + lists:filter( + fun({Nth, _Idx}) -> + lists:all( + fun({CNth, _}) -> + riak_core_membership_claim:spaced_by_n( + CNth, Nth, TargetN, RingSize) + end, + LocalIndicesToAvoid) + end, + IndicesToCheck + ). + +%% @private +%% +%% @doc Select indices from a given candidate set, according to two +%% goals. +%% +%% 1. Ensure greedy/local target_n spacing between indices. Note that this +%% goal intentionally does not reject overall target_n violations. +%% +%% 2. Select indices based on the delta between current ownership and +%% expected ownership. In other words, if A owns 5 partitions and +%% the desired ownership is 3, then we try to claim at most 2 partitions +%% from A. +select_indices(_Owners, _Deltas, [], _TargetN, _RingSize) -> + []; +select_indices(Owners, Deltas, Indices, TargetN, RingSize) -> + OwnerDT = dict:from_list(Owners), + %% Claim partitions and check that subsequent partitions claimed by this + %% node do not break the target_n invariant. + {Claims, _NClaims, _Deltas} = + lists:foldl( + fun({Nth, Idx}, {IdxClaims, NthClaims, DeltaDT}) -> + Owner = dict:fetch(Idx, OwnerDT), + Delta = dict:fetch(Owner, DeltaDT), + MeetsTN = + lists:all( + fun(ClaimedNth) -> + riak_core_membership_claim:spaced_by_n( + ClaimedNth, Nth, TargetN, RingSize) + end, + NthClaims), + case (Delta < 0) and MeetsTN of + true -> + NextDeltaDT = + dict:update_counter(Owner, 1, DeltaDT), + {[Idx|IdxClaims], [Nth|NthClaims], NextDeltaDT}; + false -> + {IdxClaims, NthClaims, DeltaDT} + end + end, + {[], [], dict:from_list(Deltas)}, + Indices), + lists:reverse(Claims). + + +%% @private +%% +%% @doc Determines indices that violate the given target_n spacing +%% property. 
+-spec find_node_violations( + riak_core_ring:riak_core_ring(), pos_integer()) + -> list(non_neg_integer()). +find_node_violations(Ring, TargetN) -> + Owners = riak_core_ring:all_owners(Ring), + find_violations(Owners, TargetN). + +-spec find_location_violations( + riak_core_ring:riak_core_ring(), pos_integer()) + -> list(non_neg_integer()). +find_location_violations(Ring, TargetN) -> + case riak_core_location:support_locations_claim(Ring, TargetN) of + true -> + find_violations( + riak_core_location:get_location_owners(Ring), TargetN); + false -> + [] + end. + +-spec find_violations( + list({non_neg_integer(), atom()}), pos_integer()) + -> list(non_neg_integer()). +find_violations(Owners, TargetN) -> + Suffix = lists:sublist(Owners, TargetN - 1), + %% Add owners at the front to the tail, to confirm no tail violations + OwnersWithTail = Owners ++ Suffix, + %% Use a sliding window to determine violations + {Bad, _} = + lists:foldl( + fun(P={Idx, Owner}, {Out, Window}) -> + Window2 = lists:sublist([P|Window], TargetN-1), + case lists:keyfind(Owner, 2, Window) of + {_PrevIdx, Owner} -> + {[Idx | Out], Window2}; + false -> + {Out, Window2} + end + end, + {[], lists:sublist(Owners, 2, TargetN - 1)}, + OwnersWithTail), + lists:usort(Bad). + +-spec sequential_claim( + riak_core_ring:riak_core_ring(), node()) -> + riak_core_ring:riak_core_ring(). +sequential_claim(Ring, Node) -> + TN = riak_core_membership_claim:get_target_n(), + sequential_claim(Ring, Node, TN). + +%% @private fall back to diagonal striping vnodes across nodes in a +%% sequential round robin (eg n1 | n2 | n3 | n4 | n5 | n1 | n2 | n3 +%% etc) However, different to `claim_rebalance_n', this function +%% attempts to eliminate tail violations (for example a ring that +%% starts/ends n1 | n2 | ...| n3 | n4 | n1) +-spec sequential_claim( + riak_core_ring:riak_core_ring(), node(), integer()) -> + riak_core_ring:riak_core_ring(). 
+sequential_claim(Ring0, Node, TargetN) -> + Ring = riak_core_ring:upgrade(Ring0), + OrigNodes = lists:usort([Node|riak_core_ring:claiming_members(Ring)]), + Nodes = get_nodes_by_location(OrigNodes, Ring), + NodeCount = length(Nodes), + RingSize = riak_core_ring:num_partitions(Ring), + + Overhang = RingSize rem NodeCount, + HasTailViolation = (Overhang > 0 andalso Overhang < TargetN), + Shortfall = TargetN - Overhang, + SolveableNodeViolation = + solveable_violation(RingSize, NodeCount, TargetN, Shortfall) + and HasTailViolation, + + LocationsSupported = + riak_core_location:support_locations_claim(Ring, TargetN), + {SolveableLocationViolation, LocationShortfall} = + case {LocationsSupported, Overhang, RingSize div NodeCount} of + {true, OH, Loops} when OH > 0, OH > TargetN, Loops > 1 -> + MinDistance = + check_for_location_tail_violation( + Nodes, Ring, OH, TargetN), + case MinDistance of + MD when MD =< TargetN -> + SLV = + solveable_violation( + RingSize, NodeCount, TargetN, TargetN - MD), + {SLV, TargetN - MD}; + _ -> + {false, 0} + end; + _ -> + {false, 0} + end, + + Partitions = lists:sort([ I || {I, _} <- riak_core_ring:all_owners(Ring) ]), + Zipped = + case {SolveableLocationViolation, SolveableNodeViolation} of + {true, _} -> + Nodelist = + solve_tail_violations(RingSize, Nodes, LocationShortfall), + lists:zip(Partitions, Nodelist); + {_, true} -> + Nodelist = + solve_tail_violations(RingSize, Nodes, Shortfall), + lists:zip(Partitions, Nodelist); + _ -> + riak_core_membership_claim:diagonal_stripe(Ring, Nodes) + end, + + lists:foldl( + fun({P, N}, Acc) -> riak_core_ring:transfer_node(P, N, Acc) end, + Ring, + Zipped). + + +-spec check_for_location_tail_violation( + list(node()), + riak_core_ring:riak_core_ring(), + pos_integer(), + pos_integer()) -> pos_integer(). +check_for_location_tail_violation(Nodes, Ring, OH, TargetN) -> + LastNodes = lists:sublist(Nodes, 1 + OH - TargetN, TargetN), + FirstNodes = lists:sublist(Nodes, TargetN), + LocationD = riak_core_ring:get_nodes_locations(Ring), + LocationFinder = + fun(N) -> riak_core_location:get_node_location(N, LocationD) end, + LastLocations = lists:map(LocationFinder, LastNodes), + FirstLocations = + lists:zip( + lists:map(LocationFinder, FirstNodes), + lists:seq(0, TargetN - 1)), + {MinDistance, _} = + lists:foldl( + fun(L, {MinStep, TailStep}) -> + case lists:keyfind(L, 1, FirstLocations) of + {L, N} -> + {min(TailStep + N, MinStep), TailStep - 1}; + false -> + {MinStep, TailStep - 1} + end + end, + {TargetN, TargetN - 1}, + LastLocations), + MinDistance. + + +-spec solveable_violation( + pos_integer(), pos_integer(), pos_integer(), pos_integer()) -> boolean(). +solveable_violation(RingSize, NodeCount, TargetN, Shortfall) -> + case RingSize div NodeCount of + LoopCount when LoopCount >= Shortfall -> + true; + LoopCount -> + SplitSize = Shortfall div LoopCount, + BiggestTake = Shortfall - ((LoopCount - 1) * SplitSize), + (NodeCount - BiggestTake) >= TargetN + end. + +%% @doc +%% The node list mosut be of length ring size. It is made up of a set of +%% complete loops of the node list, and then a partial loop with the addition +%% of the shortfall. The for each node in the shortfall a node in the complete +%% loops must be removed +-spec solve_tail_violations( + pos_integer(), [node()], non_neg_integer()) -> [[node()]]. 
+solve_tail_violations(RingSize, Nodes, Shortfall) -> + {LastLoop, Remainder} = + lists:split(RingSize rem length(Nodes), Nodes), + ExcessLoop = lists:sublist(Remainder, Shortfall), + Tail = LastLoop ++ ExcessLoop, + LoopCount = RingSize div length(Nodes), + RemoveList = + divide_list_for_removes(lists:reverse(ExcessLoop), LoopCount), + CompleteLoops = + lists:append( + lists:duplicate(LoopCount - length(RemoveList), Nodes)), + PartialLoops = + lists:map( + fun(ENL) -> lists:subtract(Nodes, ENL) end, + RemoveList), + CompleteLoops ++ lists:append(PartialLoops) ++ Tail. + +%% @doc +%% Normally need to remove one of the excess nodes each loop around the node +%% list. However, if there are not enough loops, more than one can be removed +%% per loop - assuming the solveable_violation/4 condition passes (i.e. this +%% will not breach the TargetN). +-spec divide_list_for_removes(list(node()), pos_integer()) + -> list(list(node())). +divide_list_for_removes(Excess, LoopCount) when LoopCount >= length(Excess) -> + lists:map(fun(N) -> [N] end, Excess); +divide_list_for_removes(Excess, 1) -> + [Excess]; +divide_list_for_removes(Excess, LoopCount) -> + FetchesPerLoop = length(Excess) div LoopCount, + LastFetch = length(Excess) - FetchesPerLoop * (LoopCount - 1), + {[], GroupedFetches} = + lists:foldl( + fun(FC, {ENs, GroupedENs}) -> + {NextGroup, Remainder} = lists:split(FC, ENs), + {Remainder, GroupedENs ++ [NextGroup]} + end, + {Excess, []}, + lists:duplicate(LoopCount - 1, FetchesPerLoop) ++ [LastFetch] + ), + GroupedFetches. + +%% @private +%% Get active nodes ordered by taking location parameters into account +-spec get_nodes_by_location([node()|undefined], riak_core_ring:riak_core_ring()) -> + [node()|undefined]. +get_nodes_by_location(Nodes, Ring) -> + NodesLocations = riak_core_ring:get_nodes_locations(Ring), + case riak_core_location:has_location_set_in_cluster(NodesLocations) of + false -> + Nodes; + true -> + LocationNodesD = + riak_core_location:get_location_nodes(Nodes, NodesLocations), + stripe_nodes_by_location(LocationNodesD) + end. + +-spec stripe_nodes_by_location(dict:dict()) -> list(node()|undefined). +stripe_nodes_by_location(NodesByLocation) -> + [LNodes|RestLNodes] = + sort_lists_by_length( + lists:map(fun({_L, NL}) -> NL end, dict:to_list(NodesByLocation))), + stripe_nodes_by_location(RestLNodes, lists:map(fun(N) -> [N] end, LNodes)). + +stripe_nodes_by_location([], Acc) -> + lists:flatten(Acc); +stripe_nodes_by_location([LNodes|OtherLNodes], Acc) -> + SortedAcc = sort_lists_by_length(Acc), + {UpdatedAcc, []} = + lists:mapfoldl( + fun(NodeList, LocationNodesToAdd) -> + case LocationNodesToAdd of + [NodeToAdd|TailNodes] -> + {NodeList ++ [NodeToAdd], TailNodes}; + [] -> + {NodeList, []} + end + end, + LNodes, + SortedAcc), + stripe_nodes_by_location(OtherLNodes, UpdatedAcc). + +sort_lists_by_length(ListOfLists) -> + lists:sort(fun(L1, L2) -> length(L1) >= length(L2) end, ListOfLists). + + +%% =================================================================== +%% eunit tests +%% =================================================================== + +-ifdef(TEST). + +-include_lib("eunit/include/eunit.hrl"). + +prefilter_violations_test_() -> + % Be strict on test timeout. Unrefined code took > 10s, whereas the + % refactored code should be << 1s. + {timeout, 5, fun prefilter_violations_perf/0}. 
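%% A small worked sketch (hypothetical test, not part of this patch) of
%% divide_list_for_removes/2: with at least as many loops as excess nodes
%% each node is removed in its own loop, with a single loop all are removed
%% together, and otherwise the removes are spread across the loops.
divide_list_for_removes_example_test() ->
    ?assertMatch([[a], [b]], divide_list_for_removes([a, b], 4)),
    ?assertMatch([[a, b, c]], divide_list_for_removes([a, b, c], 1)),
    ?assertMatch([[a], [b, c]], divide_list_for_removes([a, b, c], 2)).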
+ +prefilter_violations_perf() -> + JoiningNodes = + [{l1n2, loc1}, {l1n3, loc1}, {l1n4, loc1}, + {l2n1, loc2}, {l2n2, loc2}, {l2n3, loc2}, {l2n4, loc2}, + {l3n1, loc3}, {l3n2, loc3}, {l3n3, loc3}, {l3n4, loc3}, + {l4n1, loc4}, {l4n2, loc4}, {l4n3, loc4}, {l4n4, loc4}, + {l5n1, loc5}, {l5n2, loc5}, {l5n3, loc5}, + {l6n1, loc6}, {l6n2, loc6}, {l6n3, loc6}, {l6n4, loc6}, + {l7n1, loc7}, {l7n2, loc7}], + N1 = l1n1, + N1Loc = loc1, + RingSize = 4096, + io:format( + "Testing NodeList ~w with RingSize ~w~n", + [[{N1, N1Loc}|JoiningNodes], RingSize] + ), + R1 = + riak_core_ring:set_node_location( + N1, + N1Loc, + riak_core_ring:fresh(RingSize, N1)), + + RAll = + lists:foldl( + fun({N, L}, AccR) -> + AccR0 = riak_core_ring:add_member(N1, AccR, N), + riak_core_ring:set_node_location(N, L, AccR0) + end, + R1, + JoiningNodes + ), + Owners = riak_core_ring:all_owners(RAll), + AllIndices = + lists:zip( + lists:seq(0, length(Owners)-1), [Idx || {Idx, _} <- Owners]), + + {T0, FilteredIndices0} = + timer:tc( + fun prefilter_violations/6, + [RAll, l1n2, AllIndices, AllIndices, 4, RingSize]), + io:format("Prefilter violations took ~w ms~n", [T0 div 1000]), + ?assertMatch(RingSize, length(FilteredIndices0)), + + {T1, FilteredIndices1} = + timer:tc( + fun prefilter_violations/6, + [RAll, l2n3, AllIndices, AllIndices, 4, RingSize]), + io:format("Prefilter violations took ~w ms~n", [T1 div 1000]), + ?assertMatch(RingSize, length(FilteredIndices1)), + + RTrans = riak_core_ring:transfer_node(0, l2n3, RAll), + {T2, FilteredIndices2} = + timer:tc( + fun prefilter_violations/6, + [RTrans, l2n3, AllIndices, AllIndices, 4, RingSize]), + io:format("Prefilter violations took ~w ms~n", [T2 div 1000]), + ?assertMatch(RingSize, length(FilteredIndices2) + 7), + + {T3, FilteredIndices3} = + timer:tc( + fun prefilter_violations/6, + [RTrans, l1n2, AllIndices, AllIndices, 4, RingSize]), + io:format("Prefilter violations took ~w ms~n", [T3 div 1000]), + io:format("Filtered instances ~w~n", [AllIndices -- FilteredIndices3]), + ?assertMatch(RingSize, length(FilteredIndices3) + 1), + + {T4, FilteredIndices4} = + timer:tc( + fun prefilter_violations/6, + [RTrans, l2n4, AllIndices, AllIndices, 4, RingSize]), + io:format("Prefilter violations took ~w ms~n", [T4 div 1000]), + ?assertMatch(RingSize, length(FilteredIndices4) + 7 - 1). + +location_seqclaim_t1_test() -> + JoiningNodes = + [{n2, loc1}, + {n3, loc2}, {n4, loc2}, + {n5, loc3}, {n6, loc3}, + {n7, loc4}, {n8, loc4}, + {n9, loc5}, {n10, loc5} + ], + location_claim_tester(n1, loc1, JoiningNodes, 64), + location_claim_tester(n1, loc1, JoiningNodes, 128), + location_claim_tester(n1, loc1, JoiningNodes, 256), + location_claim_tester(n1, loc1, JoiningNodes, 512), + location_claim_tester(n1, loc1, JoiningNodes, 1024), + location_claim_tester(n1, loc1, JoiningNodes, 2048). + +location_seqclaim_t2_test() -> + JoiningNodes = + [{n2, loc1}, + {n3, loc2}, {n4, loc2}, + {n5, loc3}, {n6, loc3}, + {n7, loc4}, {n8, loc4} + ], + location_claim_tester(n1, loc1, JoiningNodes, 64), + location_claim_tester(n1, loc1, JoiningNodes, 128), + location_claim_tester(n1, loc1, JoiningNodes, 256), + location_claim_tester(n1, loc1, JoiningNodes, 512), + location_claim_tester(n1, loc1, JoiningNodes, 1024), + location_claim_tester(n1, loc1, JoiningNodes, 2048). 
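%% The ring setup repeated in these tests could be captured by a helper such
%% as this sketch (hypothetical, not part of this patch): start a fresh ring
%% owned by the claimant, then add and locate each joining node in turn.
make_located_ring(Claimant, ClaimantLoc, JoiningNodes, RingSize) ->
    R0 =
        riak_core_ring:set_node_location(
            Claimant, ClaimantLoc, riak_core_ring:fresh(RingSize, Claimant)),
    lists:foldl(
        fun({N, L}, AccR) ->
            riak_core_ring:set_node_location(
                N, L, riak_core_ring:add_member(Claimant, AccR, N))
        end,
        R0,
        JoiningNodes).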
+ +location_seqclaim_t3_test() -> + JoiningNodes = + [{n2, loc1}, + {n3, loc2}, {n4, loc2}, + {n5, loc3}, {n6, loc3}, + {n7, loc4}, {n8, loc4}, + {n9, loc5}, {n10, loc5}, + {n11, loc6}, {n12, loc7}, {n13, loc8} + ], + location_claim_tester(n1, loc1, JoiningNodes, 64), + location_claim_tester(n1, loc1, JoiningNodes, 128), + location_claim_tester(n1, loc1, JoiningNodes, 256), + location_claim_tester(n1, loc1, JoiningNodes, 512), + location_claim_tester(n1, loc1, JoiningNodes, 1024), + location_claim_tester(n1, loc1, JoiningNodes, 2048). + +location_seqclaim_t4_test() -> + JoiningNodes = + [{l1n2, loc1}, {l1n3, loc1}, {l1n4, loc1}, + {l1n5, loc1}, {l1n6, loc1}, {l1n7, loc1}, {l1n8, loc1}, + {l2n1, loc2}, {l2n2, loc2}, {l2n3, loc2}, {l2n4, loc2}, + {l2n5, loc2}, {l2n6, loc2}, {l2n7, loc2}, {l2n8, loc2}, + {l3n1, loc3}, {l3n2, loc3}, {l3n3, loc3}, {l3n4, loc3}, + {l3n5, loc3}, {l3n6, loc3}, {l3n7, loc3}, {l3n8, loc3}, + {l4n1, loc4}, {l4n2, loc4}, {l4n3, loc4}, {l4n4, loc4}, + {l4n5, loc4}, {l4n6, loc4}, {l4n7, loc4}, {l4n8, loc4}, + {l5n1, loc5}, {l5n2, loc5}, {l5n3, loc5}, {l5n4, loc5}, + {l5n5, loc5}, {l5n6, loc5}, {l5n7, loc5}, + {l6n1, loc6}, {l6n2, loc6}, {l6n3, loc6}, {l6n4, loc6}, + {l6n5, loc6}, {l6n6, loc6}, {l6n7, loc6}, + {l7n1, loc7}, {l7n2, loc7}, {l7n3, loc7}], + location_claim_tester(l1n1, loc1, JoiningNodes, 128), + location_claim_tester(l1n1, loc1, JoiningNodes, 256), + location_claim_tester(l1n1, loc1, JoiningNodes, 512), + location_claim_tester(l1n1, loc1, JoiningNodes, 1024), + location_claim_tester(l1n1, loc1, JoiningNodes, 2048). + +location_seqclaim_t5_test() -> + JoiningNodes = + [{l1n2, loc1}, {l1n3, loc1}, {l1n4, loc1}, + {l2n1, loc2}, {l2n2, loc2}, {l2n3, loc2}, {l2n4, loc2}, + {l3n1, loc3}, {l3n2, loc3}, {l3n3, loc3}, {l3n4, loc3}, + {l4n1, loc4}, {l4n2, loc4}, {l4n3, loc4}, {l4n4, loc4}, + {l5n1, loc5}, {l5n2, loc5}, {l5n3, loc5}, + {l6n1, loc6}, {l6n2, loc6}, {l6n3, loc6}, {l6n4, loc6}, + {l7n1, loc7}, {l7n2, loc7}], + location_claim_tester(l1n1, loc1, JoiningNodes, 128), + location_claim_tester(l1n1, loc1, JoiningNodes, 256), + location_claim_tester(l1n1, loc1, JoiningNodes, 512), + location_claim_tester(l1n1, loc1, JoiningNodes, 1024), + location_claim_tester(l1n1, loc1, JoiningNodes, 2048). + +location_seqclaim_t6_test() -> + JoiningNodes = + [{l1n2, loc1}, {l1n3, loc1}, {l1n4, loc1}, + {l1n5, loc1}, {l1n6, loc1}, {l1n7, loc1}, {l1n8, loc1}, + {l2n1, loc2}, {l2n2, loc2}, {l2n3, loc2}, {l2n4, loc2}, + {l2n5, loc2}, {l2n6, loc2}, {l2n7, loc2}, {l2n8, loc2}, + {l3n1, loc3}, {l3n2, loc3}, {l3n3, loc3}, {l3n4, loc3}, + {l3n5, loc3}, {l3n6, loc3}, {l3n7, loc3}, {l3n8, loc3}, + {l4n1, loc4}, {l4n2, loc4}, {l4n3, loc4}, {l4n4, loc4}, + {l4n5, loc4}, {l4n6, loc4}, {l4n7, loc4}, {l4n8, loc4}, + {l5n1, loc5}, {l5n2, loc5}, + {l6n1, loc6}, {l6n2, loc6}, {l6n3, loc6}, {l6n4, loc6}, + {l6n5, loc6}, {l6n6, loc6}, {l6n7, loc6}, {l6n8, loc8}], + location_claim_tester(l1n1, loc1, JoiningNodes, 256), + location_claim_tester(l1n1, loc1, JoiningNodes, 512), + location_claim_tester(l1n1, loc1, JoiningNodes, 1024), + location_claim_tester(l1n1, loc1, JoiningNodes, 2048). + +location_claim_tester(N1, N1Loc, NodeLocList, RingSize) -> + location_claim_tester( + N1, N1Loc, NodeLocList, RingSize, sequential_claim, 4). 
+
+location_claim_tester(
+    N1, N1Loc, NodeLocList, RingSize, ClaimFun, TargetN) ->
+    io:format(
+        "Testing NodeList ~w with RingSize ~w~n",
+        [[{N1, N1Loc}|NodeLocList], RingSize]
+    ),
+    R1 =
+        riak_core_ring:set_node_location(
+            N1,
+            N1Loc,
+            riak_core_ring:fresh(RingSize, N1)),
+
+    RAll =
+        lists:foldl(
+            fun({N, L}, AccR) ->
+                AccR0 = riak_core_ring:add_member(N1, AccR, N),
+                riak_core_ring:set_node_location(N, L, AccR0)
+            end,
+            R1,
+            NodeLocList
+        ),
+    Params =
+        case ClaimFun of
+            sequential_claim ->
+                TargetN;
+            choose_claim_v4 ->
+                [{target_n_val, 3}]
+        end,
+    RClaim =
+        riak_core_membership_claim:claim(
+            RAll,
+            {riak_core_membership_claim, default_wants_claim},
+            {riak_core_claim_location, ClaimFun, Params}),
+    {RingSize, Mappings} = riak_core_ring:chash(RClaim),
+
+    check_for_failures(Mappings, TargetN, RClaim).
+
+
+check_for_failures(Mappings, TargetN, RClaim) ->
+    NLs = riak_core_ring:get_nodes_locations(RClaim),
+    LocationMap =
+        lists:map(
+            fun({Idx, N}) ->
+                {Idx, riak_core_location:get_node_location(N, NLs)}
+            end,
+            Mappings),
+    Prefix = lists:sublist(LocationMap, 3),
+    CheckableMap = LocationMap ++ Prefix,
+    {_, Failures} =
+        lists:foldl(
+            fun({Idx, L}, {LastNminus1, Fails}) ->
+                case lists:member(L, LastNminus1) of
+                    false ->
+                        {[L|lists:sublist(LastNminus1, TargetN - 2)], Fails};
+                    true ->
+                        {[L|lists:sublist(LastNminus1, TargetN - 2)],
+                            [{Idx, L, LastNminus1}|Fails]}
+                end
+            end,
+            {[], []},
+            CheckableMap
+        ),
+    lists:foreach(fun(F) -> io:format("Failure ~p~n", [F]) end, Failures),
+    ?assert(length(Failures) == 0).
+
+
+location_multistage_t1_test_() ->
+    {timeout, 60, fun location_multistage_t1_tester/0}.
+
+location_multistage_t2_test_() ->
+    {timeout, 60, fun location_multistage_t2_tester/0}.
+
+% location_multistage_t3_test_() ->
+%     {timeout, 60, fun location_multistage_t3_tester/0}.
+
+location_multistage_t4_test_() ->
+    {timeout, 60, fun location_multistage_t4_tester/0}.
+
+location_multistage_t1_tester() ->
+    %% This is a tricky corner case where we would fail to meet TargetN for
+    %% locations if joining all 9 nodes in one claim (as sequential_claim will
+    %% not succeed). However, if we join 8 nodes, then add the 9th, TargetN
+    %% is always achieved.
+    JoiningNodes =
+        [{l1n2, loc1},
+            {l2n3, loc2}, {l2n4, loc2},
+            {l3n5, loc3}, {l3n6, loc3},
+            {l4n7, loc4}, {l4n8, loc4}
+        ],
+    location_multistage_claim_tester(64, JoiningNodes, 4, l5n9, loc5, 4),
+    location_multistage_claim_tester(128, JoiningNodes, 4, l5n9, loc5, 4),
+    location_multistage_claim_tester(256, JoiningNodes, 4, l5n9, loc5, 4),
+    location_multistage_claim_tester(512, JoiningNodes, 4, l5n9, loc5, 4),
+    location_multistage_claim_tester(1024, JoiningNodes, 4, l5n9, loc5, 4),
+    location_multistage_claim_tester(2048, JoiningNodes, 4, l5n9, loc5, 4).
+
+location_multistage_t2_tester() ->
+    %% This is a tricky corner case as with location_multistage_t1_tester/0,
+    %% but now, because the ring size does not divide evenly by the TargetN,
+    %% only TargetN - 1 can be achieved for locations.
+    JoiningNodes =
+        [{l1n2, loc1},
+            {l2n3, loc2}, {l2n4, loc2},
+            {l3n5, loc3}, {l3n6, loc3}
+        ],
+    location_multistage_claim_tester(64, JoiningNodes, 3, l4n7, loc4, 2),
+    location_multistage_claim_tester(128, JoiningNodes, 3, l4n7, loc4, 2),
+    location_multistage_claim_tester(256, JoiningNodes, 3, l4n7, loc4, 2),
+    location_multistage_claim_tester(512, JoiningNodes, 3, l4n7, loc4, 2),
+    location_multistage_claim_tester(1024, JoiningNodes, 3, l4n7, loc4, 2),
+    location_multistage_claim_tester(2048, JoiningNodes, 3, l4n7, loc4, 2).
+
+% location_multistage_t3_tester() ->
+%     %% This is a minimal case for having TargetN locations, and an uneven
+%     %% Allocation around the locations. Is TargetN - 1 still upheld?
+%     JoiningNodes =
+%         [{l1n2, loc1},
+%             {l2n3, loc2}, {l2n6, loc2},
+%             {l3n4, loc3},
+%             {l4n5, loc4}
+%         ],
+%     location_multistage_claim_tester(64, JoiningNodes, 4, l3n7, loc3, 3),
+%     location_multistage_claim_tester(128, JoiningNodes, 4, l3n7, loc3, 3),
+%     location_multistage_claim_tester(256, JoiningNodes, 4, l3n7, loc3, 3),
+%     location_multistage_claim_tester(512, JoiningNodes, 4, l3n7, loc3, 3),
+%     location_multistage_claim_tester(1024, JoiningNodes, 4, l3n7, loc3, 3),
+%     location_multistage_claim_tester(2048, JoiningNodes, 4, l3n7, loc3, 3).
+
+location_multistage_t4_tester() ->
+    JoiningNodes =
+        [{l1n2, loc1},
+            {l2n3, loc2}, {l2n4, loc2},
+            {l3n5, loc3}, {l3n6, loc3},
+            {l4n7, loc4}, {l4n8, loc4},
+            {l5n9, loc5}
+        ],
+
+    location_multistage_claim_tester(64, JoiningNodes, 4, l5n10, loc5, 4).
+    % location_multistage_claim_tester(128, JoiningNodes, 4, l5n10, loc5, 4),
+    % location_multistage_claim_tester(256, JoiningNodes, 4, l5n10, loc5, 4),
+    % location_multistage_claim_tester(512, JoiningNodes, 4, l5n10, loc5, 4),
+    % location_multistage_claim_tester(1024, JoiningNodes, 4, l5n10, loc5, 4),
+    % location_multistage_claim_tester(2048, JoiningNodes, 4, l5n10, loc5, 4).
+ +location_multistage_claim_tester( + RingSize, JoiningNodes, TargetN, NewNode, NewLocation, VerifyN) -> + SW0 = os:timestamp(), + N1 = l1n1, + N1Loc = loc1, + io:format( + "Testing NodeList ~w with RingSize ~w~n", + [[{N1, N1Loc}|JoiningNodes], RingSize] + ), + R1 = + riak_core_ring:set_node_location( + N1, + N1Loc, + riak_core_ring:fresh(RingSize, N1)), + + RAll = + lists:foldl( + fun({N, L}, AccR) -> + AccR0 = riak_core_ring:add_member(N1, AccR, N), + riak_core_ring:set_node_location(N, L, AccR0) + end, + R1, + JoiningNodes + ), + Params = [{target_n_val, TargetN}], + SW1 = os:timestamp(), + RClaimInit = + riak_core_membership_claim:claim( + RAll, + {riak_core_membership_claim, default_wants_claim}, + {riak_core_claim_location, choose_claim_v4, Params}), + SW2 = os:timestamp(), + io:format("Reclaiming without committing~n"), + + RingExtendA = + riak_core_ring:set_node_location( + NewNode, + NewLocation, + riak_core_ring:add_member(N1, RClaimInit, NewNode)), + RClaimExtendA = + riak_core_membership_claim:claim( + RingExtendA, + {riak_core_membership_claim, default_wants_claim}, + {riak_core_claim_location, choose_claim_v4, Params}), + + io:format("Commit initial claim~n"), + SW3 = os:timestamp(), + + RClaimInitCommit = + riak_core_ring:increment_vclock( + node(), + riak_core_ring:clear_location_changed(RClaimInit)), + + io:format("Reclaiming following commit~n"), + SW4 = os:timestamp(), + + RingExtendB = + riak_core_ring:set_node_location( + NewNode, + NewLocation, + riak_core_ring:add_member(N1, RClaimInitCommit, NewNode)), + RClaimExtendB = + riak_core_membership_claim:claim( + RingExtendB, + {riak_core_membership_claim, default_wants_claim}, + {riak_core_claim_location, choose_claim_v4, Params}), + + {_RingSizeInit, MappingsInit} = riak_core_ring:chash(RClaimInit), + {RingSizeA, MappingsA} = riak_core_ring:chash(RClaimExtendA), + {RingSizeB, MappingsB} = riak_core_ring:chash(RClaimExtendB), + + SW5 = os:timestamp(), + + ?assert(RingSizeA == RingSizeB), + ?assert(MappingsA == MappingsB), + + io:format("Testing initial Mappings:~n~n~p~n", [MappingsInit]), + check_for_failures(MappingsInit, VerifyN, RClaimInit), + io:format("Testing secondary Mappings:~n~n~p~n", [MappingsB]), + check_for_failures(MappingsB, VerifyN, RClaimExtendB), + + SW6 = os:timestamp(), + io:format( + "Test for RingSize ~w had timings:" + "Setup ~w First Claim ~w Next Claim ~w Commit ~w Other Claims ~w Verify ~w~n", + [RingSize, + timer:now_diff(SW1, SW0) div 1000, + timer:now_diff(SW2, SW1) div 1000, + timer:now_diff(SW3, SW2) div 1000, + timer:now_diff(SW4, SW3) div 1000, + timer:now_diff(SW5, SW4) div 1000, + timer:now_diff(SW6, SW5) div 1000] + ). + +-endif. \ No newline at end of file diff --git a/src/riak_core_location.erl b/src/riak_core_location.erl index 6220b5019..854ba43ff 100644 --- a/src/riak_core_location.erl +++ b/src/riak_core_location.erl @@ -2,11 +2,15 @@ %% API -export([get_node_location/2, - has_location_set_in_cluster/1, - stripe_nodes_by_location/2, - check_ring/1, - check_ring/2, - check_ring/3]). + has_location_set_in_cluster/1, + stripe_nodes_by_location/2, + check_ring/1, + check_ring/2, + check_ring/3, + support_locations_claim/2, + get_location_owners/1, + get_location_nodes/2, + local_nodes/2]). -spec get_node_location(node(), dict:dict()) -> string() | undefined. get_node_location(Node, Locations) -> @@ -108,4 +112,54 @@ check_ring(Ring, Nval, MinimumNumberOfDistinctLocations, Locations) -> -spec get_unique_locations(list(), dict:dict()) -> list(). 
get_unique_locations(PrefLists, Locations) -> - lists:usort([get_node_location(Node, Locations) || {_, Node} <- PrefLists]). \ No newline at end of file + lists:usort([get_node_location(Node, Locations) || {_, Node} <- PrefLists]). + +%% @doc +%% Are there sufficient locations to support an attempt to cluster claim a +%% given nval. This will be validated before using a location aware claim +%% algorithm +-spec support_locations_claim( + riak_core_ring:riak_core_ring(), pos_integer()) -> boolean(). +support_locations_claim(Ring, TargetNVal) -> + Locations = riak_core_ring:get_nodes_locations(Ring), + case has_location_set_in_cluster(Locations) of + true -> + UniqueLocations = + lists:usort( + lists:map( + fun({_N, L}) -> L end, + dict:to_list(Locations))), + length(UniqueLocations) >= TargetNVal; + false -> + false + end. + +%% @doc +%% Find a mapping between Idx vales and the locations of the nodes which +%% currently own them +-spec get_location_owners( + riak_core_ring:riak_core_ring()) -> list({non_neg_integer(), atom()}). +get_location_owners(Ring) -> + Locations = riak_core_ring:get_nodes_locations(Ring), + lists:map( + fun({Idx, Node}) -> + {Idx, get_node_location(Node, Locations)} + end, + riak_core_ring:all_owners(Ring)). + +%% @doc +%% Find other nodes in the same location of this node +-spec local_nodes(riak_core_ring:riak_core_ring(), node()) -> list(node()). +local_nodes(Ring, Node) -> + Locations = riak_core_ring:get_nodes_locations(Ring), + ThisLocation = get_node_location(Node, Locations), + lists:filtermap( + fun({N, L}) -> + case {N, L} of + {N, ThisLocation} when N =/= Node -> + {true, N}; + _ -> + false + end + end, + dict:to_list(Locations)). \ No newline at end of file diff --git a/src/riak_core_membership_claim.erl b/src/riak_core_membership_claim.erl index 7d45a8760..cc6c5ae04 100644 --- a/src/riak_core_membership_claim.erl +++ b/src/riak_core_membership_claim.erl @@ -55,15 +55,14 @@ -export([claim/1, claim/3, claim_until_balanced/2, claim_until_balanced/4]). -export([default_wants_claim/1, default_wants_claim/2, default_choose_claim/1, default_choose_claim/2, default_choose_claim/3, - never_wants_claim/1, never_wants_claim/2, - random_choose_claim/1, random_choose_claim/2, random_choose_claim/3]). + default_choose_params/0, default_choose_params/1]). -export([wants_claim_v2/1, wants_claim_v2/2, wants_claim_v3/1, wants_claim_v3/2, choose_claim_v2/1, choose_claim_v2/2, choose_claim_v2/3, choose_claim_v3/1, choose_claim_v3/2, choose_claim_v3/3, claim_rebalance_n/2, claim_diversify/3, claim_diagonal/3, wants/1, wants_owns_diff/2, meets_target_n/2, diagonal_stripe/2, - sequential_claim/2, get_counts/2]). + sequential_claim/2, get_counts/2, spaced_by_n/4, get_target_n/0]). -ifdef(TEST). -compile(export_all). @@ -76,6 +75,7 @@ eqc_check/2, prop_claim_ensures_unique_nodes_v2/0, % prop_claim_ensures_unique_nodes_v3/0, + prop_claim_ensures_unique_nodes_v4/0, prop_take_idxs/0 ]). -include_lib("eqc/include/eqc.hrl"). @@ -106,7 +106,7 @@ claim(Ring) -> choose_claim_v3 -> {riak_core_memberhsip_claim, choose_claim_v3}; choose_claim_v4 -> - {riak_core_memberhsip_claim, choose_claim_v4}; + {riak_core_claim_location, choose_claim_v4}; {CMod, CFun} -> {CMod, CFun} end, @@ -232,12 +232,15 @@ default_choose_params() -> default_choose_params(Params) -> case proplists:get_value(target_n_val, Params) of undefined -> - TN = app_helper:get_env(riak_core, target_n_val, ?DEF_TARGET_N), + TN = get_target_n(), [{target_n_val, TN} | Params]; _-> Params end. 
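%% A worked sketch (hypothetical test, assuming no target_n_val is set in
%% the riak_core application environment and that ?DEF_TARGET_N is 4): a
%% missing target_n_val is filled in from configuration, while a supplied
%% value passes through untouched.
default_choose_params_example_test() ->
    ?assertMatch([{target_n_val, 4}], default_choose_params([])),
    ?assertMatch(
        [{target_n_val, 3}], default_choose_params([{target_n_val, 3}])).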
+get_target_n() -> + app_helper:get_env(riak_core, target_n_val, ?DEF_TARGET_N). + %% =================================================================== %% Claim Function Implementations %% =================================================================== @@ -537,8 +540,8 @@ choose_claim_v3(Ring) -> choose_claim_v3(Ring, node()). choose_claim_v3(Ring, ClaimNode) -> - Params = [{target_n_val, app_helper:get_env(riak_core, target_n_val, - ?DEF_TARGET_N)}], + Params = + [{target_n_val, get_target_n()}], choose_claim_v3(Ring, ClaimNode, Params). choose_claim_v3(Ring, _ClaimNode, Params) -> @@ -594,7 +597,7 @@ choose_claim_v3(Ring, _ClaimNode, Params) -> %% Lower diversity score is better, 0 if nodes are perfectly diverse. %% claim_v3(Wants, Owners, Params) -> - TN = proplists:get_value(target_n_val, Params, ?DEF_TARGET_N), + TN = proplists:get_value(target_n_val, Params, get_target_n()), Q = length(Owners), Claiming = [N || {N,W} <- Wants, W > 0], Trials = proplists:get_value(trials, Params, 100), @@ -660,7 +663,7 @@ claim_diagonal(Wants, Owners, Params) -> {lists:flatten([lists:duplicate(Reps, Claiming), Last]), [diagonalized]}. sequential_claim(Ring, Node) -> - TN = app_helper:get_env(riak_core, target_n_val, ?DEF_TARGET_N), + TN = get_target_n(), sequential_claim(Ring, Node, TN). %% @private fall back to diagonal striping vnodes across nodes in a @@ -784,28 +787,10 @@ diagonal_stripe(Ring, Nodes) -> 1, length(Partitions))), Zipped. -random_choose_claim(Ring) -> - random_choose_claim(Ring, node()). - -random_choose_claim(Ring, Node) -> - random_choose_claim(Ring, Node, []). - -random_choose_claim(Ring0, Node, _Params) -> - Ring = riak_core_ring:upgrade(Ring0), - riak_core_ring:transfer_node(riak_core_ring:random_other_index(Ring), - Node, Ring). - -%% @spec never_wants_claim(riak_core_ring()) -> no -%% @doc For use by nodes that should not claim any partitions. -never_wants_claim(_) -> no. -never_wants_claim(_,_) -> no. - %% =================================================================== %% Private %% =================================================================== - - %% @private %% %% @doc Determines indices that violate the given target_n spacing @@ -833,14 +818,17 @@ find_violations(Ring, TargetN) -> [{node(), pos_integer()}]. get_counts(Nodes, PartitionOwners) -> Empty = [{Node, 0} || Node <- Nodes], - Counts = lists:foldl(fun({_Idx, Node}, Counts) -> - case lists:member(Node, Nodes) of - true -> - dict:update_counter(Node, 1, Counts); - false -> - Counts - end - end, dict:from_list(Empty), PartitionOwners), + Counts = + lists:foldl( + fun({_Idx, Node}, Counts) -> + case lists:member(Node, Nodes) of + true -> + dict:update_counter(Node, 1, Counts); + false -> + Counts + end + end, + dict:from_list(Empty), PartitionOwners), dict:to_list(Counts). %% @private @@ -904,8 +892,6 @@ select_indices(Owners, Deltas, Indices, TargetN, RingSize) -> Indices), lists:reverse(Claim). -%% @private -%% %% @doc Determine if two positions in the ring meet target_n spacing. spaced_by_n(NthA, NthB, TargetN, RingSize) -> case NthA > NthB of @@ -1330,6 +1316,19 @@ prop_claim_ensures_unique_nodes_v2() -> % prop_claim_ensures_unique_nodes_v3() -> % prop_claim_ensures_unique_nodes(choose_claim_v3). +prop_claim_ensures_unique_nodes_v4() -> + prop_claim_ensures_unique_nodes(choose_claim_v4). + +choose_claim_v4(Ring) -> + choose_claim_v4(Ring, node()). + +choose_claim_v4(Ring, Node) -> + Params = default_choose_params(), + choose_claim_v4(Ring, Node, Params). 
+ +choose_claim_v4(Ring, Node, Params) -> + riak_core_claim_location:choose_claim_v4(Ring, Node, Params). + %% NOTE: this is a less than adequate test that has been re-instated %% so that we don't leave the code worse than we found it. Work that %% fixed claim_v2's tail violations and vnode balance issues did not From 35b9e2f69f75c73ca621614b743972bce7571833 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 29 Mar 2023 19:20:12 +0100 Subject: [PATCH 05/30] Location claim improvements Location claim improved so that it will try to balance the spread of vnodes, if it reaches the end and is still unbalanced. Also uses a stronger meets_target_n to fallback to sequential_claim more reliably on incorrect spacing (of vnodes across nodes, but not yet across locations). --- src/riak_core_claim_location.erl | 492 +++++++++++++++++++++++------ src/riak_core_location.erl | 2 +- src/riak_core_membership_claim.erl | 4 +- 3 files changed, 401 insertions(+), 97 deletions(-) diff --git a/src/riak_core_claim_location.erl b/src/riak_core_claim_location.erl index ce4bf1848..f901e5716 100644 --- a/src/riak_core_claim_location.erl +++ b/src/riak_core_claim_location.erl @@ -30,6 +30,11 @@ sort_members_for_choose/3 ]). +-spec sort_members_for_choose( + riak_core_ring:riak_core_ring(), + list(node()), + list({non_neg_integer(), node()})) -> + list({non_neg_integer(), node()}). sort_members_for_choose(Ring, Members, Owners) -> NodesLocations = riak_core_ring:get_nodes_locations(Ring), case riak_core_location:has_location_set_in_cluster(NodesLocations) of @@ -73,13 +78,16 @@ choose_claim_v4(Ring, Node) -> Params = riak_core_membership_claim:default_choose_params(), choose_claim_v4(Ring, Node, Params). +-spec choose_claim_v4( + riak_core_ring:riak_core_ring(), node(), list(tuple())) -> + riak_core_ring:riak_core_ring(). 
choose_claim_v4(Ring, Node, Params0) -> Params = riak_core_membership_claim:default_choose_params(Params0), Active = riak_core_ring:claiming_members(Ring), Owners = riak_core_ring:all_owners(Ring), Ownerships = riak_core_membership_claim:get_counts(Active, Owners), RingSize = riak_core_ring:num_partitions(Ring), - NodeCount = erlang:length(Active), + NodeCount = length(Active), {MinVnodes, MaxVnodes, Deltas} = assess_deltas(RingSize, NodeCount, Ownerships), {Node, CurrentOwnerships} = @@ -104,94 +112,42 @@ choose_claim_v4(Ring, Node, Params0) -> [Idx || {Idx, _} <- Owners] ), AllIndices = - case NodesAllClaimed of - true -> - ZippedIndices; - false -> - StripeCount = max(1, (length(Active) - 1)), - StripeList = - lists:map( - fun({Nth, I}) -> {Nth rem StripeCount, Nth, I} end, - ZippedIndices), - Counter = - dict:from_list( - lists:map( - fun(I) -> {I, 0} end, - lists:seq(0, StripeCount - 1)) - ), - Counted = - lists:foldl( - fun({R, _Nth, _I}, C) -> - dict:update_counter(R, 1, C) - end, - Counter, - StripeList), - lists:map( - fun({_OD, _RC, _R, Nth, I}) -> {Nth, I} end, - lists:sort( - lists:map( - fun({R, Nth, I}) -> - {I, Owner} = lists:keyfind(I, 1, Owners), - {Owner, Delta} = lists:keyfind(Owner, 1, Deltas), - {Delta, dict:fetch(R, Counted), R, Nth, I} - end, - lists:reverse(StripeList) - ) - ) - ) - end, + sort_indices_for_claim( + ZippedIndices, length(Active), Owners, Deltas, NodesAllClaimed), EnoughNodes = (NodeCount > TargetN) or ((NodeCount == TargetN) and (RingSize rem TargetN =:= 0)), - case EnoughNodes of - true -> - %% If we have enough nodes to meet target_n, then we prefer to - %% claim indices that are currently causing violations, and then - %% fallback to indices in linear order. The filtering steps below - %% will ensure no new violations are introduced. - NodeViolations = find_node_violations(Ring, TargetN), - LocationViolations = - lists:subtract( - find_location_violations(Ring, TargetN), NodeViolations), - {DirtyNodeIndices, OtherIndices} = - lists:splitwith( - fun({_Nth, Idx}) -> - lists:member(Idx, NodeViolations) - end, - AllIndices), - {DirtyLocationIndices, CleanIndices} = - lists:splitwith( - fun({_Nth, Idx}) -> - lists:member(Idx, LocationViolations) - end, - OtherIndices - ), - Indices = DirtyNodeIndices ++ DirtyLocationIndices ++ CleanIndices; - false -> - %% If we do not have enough nodes to meet target_n, then we prefer - %% claiming the same indices that would occur during a - %% re-diagonalization of the ring with target_n nodes, falling - %% back to linear offsets off these preferred indices when the - %% number of indices desired is less than the computed set. - Padding = lists:duplicate(TargetN, undefined), - Expanded = lists:sublist(Active ++ Padding, TargetN), - ExpandedLocation = get_nodes_by_location(Expanded, Ring), - PreferredClaim = - riak_core_membership_claim:diagonal_stripe( - Ring, ExpandedLocation), - PreferredNth = [begin - {Nth, Idx} = lists:keyfind(Idx, 2, AllIndices), - Nth - end || {Idx,Owner} <- PreferredClaim, - Owner =:= Node], - Offsets = lists:seq(0, RingSize div length(PreferredNth)), - AllNth = lists:sublist([(X+Y) rem RingSize || Y <- Offsets, - X <- PreferredNth], - RingSize), - Indices = [lists:keyfind(Nth, 1, AllIndices) || Nth <- AllNth] - end, + Indices = + case EnoughNodes of + true -> + %% If we have enough nodes to meet target_n, then we prefer to + %% claim indices that are currently causing violations, and + %% then fallback to indices in linear order. 
The filtering + %% steps below will ensure no new violations are introduced. + NodeViolations = find_node_violations(Ring, TargetN), + LocationViolations = + lists:subtract( + find_location_violations(Ring, TargetN), + NodeViolations), + {DirtyNodeIndices, OtherIndices} = + lists:splitwith( + fun({_Nth, Idx}) -> + lists:member(Idx, NodeViolations) + end, + AllIndices), + {DirtyLocationIndices, CleanIndices} = + lists:splitwith( + fun({_Nth, Idx}) -> + lists:member(Idx, LocationViolations) + end, + OtherIndices + ), + DirtyNodeIndices ++ DirtyLocationIndices ++ CleanIndices; + false -> + AllIndices + end, %% Filter out indices that conflict with the node's existing ownership ClaimableIdxs = @@ -215,25 +171,179 @@ choose_claim_v4(Ring, Node, Params0) -> Ring, Claim2), - BadRing = - riak_core_membership_claim:meets_target_n(NewRing, TargetN) == false, + BadRing = length(meets_target_n(NewRing, TargetN)) > 0, DeficientClaim = (length(Claim2) + CurrentOwnerships) < MinVnodes, BadClaim = EnoughNodes and BadRing and NodesAllClaimed, + + MaybeBalancedRing = + case NodesAllClaimed and (MinVnodes < MaxVnodes) of + true -> + NewOwners = riak_core_ring:all_owners(NewRing), + NewOwnerships = + riak_core_membership_claim:get_counts(Active, NewOwners), + {MinVnodes, MaxVnodes, NewDeltas} + = assess_deltas(RingSize, NodeCount, NewOwnerships), + NodesToGive = + lists:filter( + fun({_N, D}) -> + case D of + D when D < (MinVnodes - MaxVnodes) -> + true; + _ -> + false + end + end, + NewDeltas), + NodesToTake = + lists:filtermap( + fun({N, D}) -> + case D of 0 -> {true, N}; _ -> false end + end, + NewDeltas), + give_partitions( + NodesToGive, NodesToTake, ZippedIndices, TargetN, NewRing); + false -> + NewRing + end, case BadClaim or DeficientClaim of true -> - %% Unable to claim, fallback to re-diagonalization sequential_claim(Ring, Node, TargetN); _ -> - NewRing + MaybeBalancedRing + end. + +-spec give_partitions( + list({node(), integer()}), + list(node()), + list({non_neg_integer(), non_neg_integer()}), + pos_integer(), + riak_core_ring:riak_core_ring()) -> riak_core_ring:riak_core_ring(). +give_partitions([], _TakeNodes, _ZipIndices, _TargetN, Ring) -> + Ring; +give_partitions(_, [], _ZipIndices, _TargetN, Ring) -> + Ring; +give_partitions([{_Node, -1}|Rest], TakeNodes, ZipIndices, TargetN, Ring) -> + give_partitions(Rest, TakeNodes, ZipIndices, TargetN, Ring); +give_partitions([{Node, D}|Rest], TakeNodes, ZipIndices, TargetN, Ring) -> + Owners = riak_core_ring:all_owners(Ring), + Partitions = + lists:filtermap( + fun({Idx, N}) -> case N of Node -> {true, Idx}; _ -> false end end, + Owners), + {Success, ClaimableIdx, ReceivingNode} = + lists:foldl( + fun (_Idx, {true, P, RcvNode}) -> + {true, P, RcvNode}; + (Idx, {false, undefined, undefined}) -> + PotentialHomes = + find_home( + Idx, TakeNodes, ZipIndices, TargetN, Owners, Ring), + case PotentialHomes of + [] -> + {false, undefined, undefined}; + [HN|_Rest] -> + {true, Idx, HN} + end + end, + {false, undefined, undefined}, + Partitions), + case {Success, ClaimableIdx, ReceivingNode} of + {true, ClaimableIdx, ReceivingNode} -> + give_partitions( + [{Node, D + 1}|Rest], + TakeNodes -- [ReceivingNode], + ZipIndices, + TargetN, + riak_core_ring:transfer_node( + ClaimableIdx, ReceivingNode, Ring)); + {false, undefined, undefined} -> + give_partitions(Rest, TakeNodes, ZipIndices, TargetN, Ring) end. 
+find_home(Idx, TakeNodes, ZippedIndices, TargetN, Owners, Ring) -> + {Nth, Idx} = lists:keyfind(Idx, 2, ZippedIndices), + RS = length(ZippedIndices), + OwningNodes = + lists:usort( + lists:map( + fun(N0) -> + N1 = + case N0 of + N0 when N0 < 0 -> RS + N0; + N0 when N0 >= RS -> N0 - RS; + N0 -> N0 + end, + {N1, I} = lists:keyfind(N1, 1, ZippedIndices), + {I, O} = lists:keyfind(I, 1, Owners), + O + end, + lists:seq((Nth + 1) - TargetN, Nth + TargetN - 1) -- [Nth]) + ), + NodesLocations = riak_core_ring:get_nodes_locations(Ring), + case riak_core_location:has_location_set_in_cluster(NodesLocations) of + false -> + TakeNodes -- OwningNodes; + true -> + Locations = + lists:usort( + lists:map( + fun(N) -> + riak_core_location:get_node_location( + N, NodesLocations) + end, + OwningNodes)), + lists:filter( + fun(TN0) -> + not lists:member( + riak_core_location:get_node_location( + TN0, NodesLocations), + Locations) + end, + TakeNodes) + end. + + +-spec sort_indices_for_claim( + list({non_neg_integer(), non_neg_integer()}), + pos_integer(), + [{non_neg_integer(), node()}], + [{node(), integer()}], + boolean()) -> list({non_neg_integer(), non_neg_integer()}). +sort_indices_for_claim( + ZippedIndices, ActiveMemberCount, Owners, Deltas, _NodesAllClaimed) -> + StripeCount = max(1, (ActiveMemberCount - 1)), + StripeList = + lists:map( + fun({Nth, I}) -> {Nth rem StripeCount, Nth, I} end, + ZippedIndices), + Counter = + dict:from_list( + lists:map(fun(I) -> {I, 0} end, lists:seq(0, StripeCount - 1))), + Counted = + lists:foldl( + fun({R, _Nth, _I}, C) -> dict:update_counter(R, 1, C) end, + Counter, + StripeList), + lists:map( + fun({_OD, _RC, _R, Nth, I}) -> {Nth, I} end, + lists:sort( + lists:map( + fun({R, Nth, I}) -> + {I, Owner} = lists:keyfind(I, 1, Owners), + {Owner, Delta} = lists:keyfind(Owner, 1, Deltas), + {Delta, dict:fetch(R, Counted), R, Nth, I} + end, + lists:reverse(StripeList) + ))). + %% @doc -%% Assess what the mnimum and maximum number of vnodes which should be owned by -%% each node, and return a list of nodes with the Deltas from the minimum i.e. -%% where a node has more vnodes than the minimum the delta will be a negative -%% number indicating the number of vnodes it can offer to a node with wants. +%% Assess what the minimum and maximum number of vnodes which should be owned +%% by each node, and return a list of nodes with the Deltas from the minimum +%% i.e. where a node has more vnodes than the minimum the delta will be a +%% negative number indicating the number of vnodes it can offer to a node with +%% wants. -spec assess_deltas( pos_integer(), pos_integer(), [{node(), non_neg_integer()}]) -> {non_neg_integer(), pos_integer(), [{node(), integer()}]}. @@ -323,6 +433,31 @@ safe_indices( IndicesToCheck ). + +-spec meets_target_n( + riak_core_ring:riak_core_ring(), pos_integer()) -> + list({non_neg_integer(), node(), list(node())}). +meets_target_n(Ring, TargetN) when TargetN > 1 -> + {_RingSize, Mappings} = riak_core_ring:chash(Ring), + Prefix = lists:sublist(Mappings, TargetN - 1), + CheckableMap = Mappings ++ Prefix, + {_, Failures} = + lists:foldl( + fun({Idx, N}, {LastNminus1, Fails}) -> + case lists:member(N, LastNminus1) of + false -> + {[N|lists:sublist(LastNminus1, TargetN - 2)], Fails}; + true -> + {[N|lists:sublist(LastNminus1, TargetN - 2)], + [{Idx, N, LastNminus1}|Fails]} + end + end, + {[], []}, + CheckableMap), + Failures; +meets_target_n(_Ring, _TargetN) -> + true. 
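%% A quick sketch (hypothetical test, not part of this patch) of the failure
%% list this stricter meets_target_n/2 returns: a fresh ring is wholly owned
%% by one node, so with a target_n of 2 every checked position after the
%% first violates (8 of the 9 positions - the mappings plus a wrapped prefix
%% of one).
meets_target_n_example_test() ->
    Ring = riak_core_ring:fresh(8, n1),
    ?assertMatch(8, length(meets_target_n(Ring, 2))).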
+ %% @private %% %% @doc Select indices from a given candidate set, according to two @@ -428,8 +563,7 @@ sequential_claim(Ring, Node) -> -spec sequential_claim( riak_core_ring:riak_core_ring(), node(), integer()) -> riak_core_ring:riak_core_ring(). -sequential_claim(Ring0, Node, TargetN) -> - Ring = riak_core_ring:upgrade(Ring0), +sequential_claim(Ring, Node, TargetN) -> OrigNodes = lists:usort([Node|riak_core_ring:claiming_members(Ring)]), Nodes = get_nodes_by_location(OrigNodes, Ring), NodeCount = length(Nodes), @@ -628,6 +762,27 @@ sort_lists_by_length(ListOfLists) -> -include_lib("eunit/include/eunit.hrl"). +simple_cluster_test() -> + RingSize = 32, + TargetN = 4, + NodeList = [n1, n2, n3, n4, n5, n6], + R0 = riak_core_ring:fresh(RingSize, n1), + R1 = + lists:foldl( + fun(N, AccR) -> riak_core_ring:add_member(n1, AccR, N) end, + R0, + NodeList -- [n1]), + Props = [{target_n_val, TargetN}], + RClaim = + riak_core_membership_claim:claim( + R1, + {riak_core_membership_claim, default_wants_claim}, + {riak_core_claim_location, choose_claim_v4, Props}), + Failures = meets_target_n(RClaim, TargetN), + lists:foreach(fun(F) -> io:format("Failure ~p~n", [F]) end, Failures), + ?assert(length(Failures) == 0). + + prefilter_violations_test_() -> % Be strict on test timeout. Unrefined code took > 10s, whereas the % refactored code should be << 1s. @@ -1047,4 +1202,153 @@ location_multistage_claim_tester( timer:now_diff(SW6, SW5) div 1000] ). +location_typical_expansion_test() -> + location_typical_expansion_tester(256) + %, + % location_typical_expansion_tester(512) + . + +location_typical_expansion_tester(RingSize) -> + N1 = l1n1, + N1Loc = loc1, + TargetN = 4, + InitJoiningNodes = + [{l1n2, loc1}, + {l2n3, loc2}, {l2n4, loc2}, + {l3n5, loc3}, {l3n6, loc3}, + {l4n7, loc4}, {l4n8, loc4}], + + io:format( + "Testing NodeList ~w with RingSize ~w~n", + [[{N1, N1Loc}|InitJoiningNodes], RingSize] + ), + R1 = + riak_core_ring:set_node_location( + N1, + N1Loc, + riak_core_ring:fresh(RingSize, N1)), + + RAll = + lists:foldl( + fun({N, L}, AccR) -> + AccR0 = riak_core_ring:add_member(N1, AccR, N), + riak_core_ring:set_node_location(N, L, AccR0) + end, + R1, + InitJoiningNodes + ), + Params = [{target_n_val, TargetN}], + RClaimInit = + riak_core_membership_claim:claim( + RAll, + {riak_core_membership_claim, default_wants_claim}, + {riak_core_claim_location, choose_claim_v4, Params}), + {RingSize, MappingsInit} = riak_core_ring:chash(RClaimInit), + + check_for_failures(MappingsInit, TargetN, RClaimInit), + + Stage1Ring = + lists:foldl( + fun(JN, R) -> + riak_core_ring:set_member(node(), R, JN, valid, same_vclock) + end, + RClaimInit, + riak_core_ring:members(RClaimInit, [joining]) + ), + + RClaimStage2 = add_node(Stage1Ring, N1, l5n9, loc5, Params), + {RingSize, Mappings2} = riak_core_ring:chash(RClaimStage2), + check_for_failures(Mappings2, TargetN, RClaimStage2), + Stage2Ring = commit_change(RClaimStage2), + + RClaimStage3 = add_node(Stage2Ring, N1, l5n10, loc5, Params), + {RingSize, Mappings3} = riak_core_ring:chash(RClaimStage3), + check_for_failures(Mappings3, TargetN, RClaimStage3), + Stage3Ring = commit_change(RClaimStage3), + + RClaimStage4 = add_node(Stage3Ring, N1, l6n11, loc6, Params), + {RingSize, Mappings4} = riak_core_ring:chash(RClaimStage4), + check_for_failures(Mappings4, TargetN, RClaimStage4), + Stage4Ring = commit_change(RClaimStage4), + + RClaimStage5 = add_node(Stage4Ring, N1, l6n12, loc6, Params), + {RingSize, Mappings5} = riak_core_ring:chash(RClaimStage5), + 
check_for_failures(Mappings5, TargetN, RClaimStage5), + Stage5Ring = commit_change(RClaimStage5), + + RClaimStage6 = add_node(Stage5Ring, N1, l1n13, loc1, Params), + {RingSize, Mappings6} = riak_core_ring:chash(RClaimStage6), + check_for_failures(Mappings6, TargetN, RClaimStage6), + Stage6Ring = commit_change(RClaimStage6), + + RClaimStage7 = add_node(Stage6Ring, N1, l2n14, loc2, Params), + {RingSize, Mappings7} = riak_core_ring:chash(RClaimStage7), + check_for_failures(Mappings7, TargetN, RClaimStage7), + _Stage7Ring = commit_change(RClaimStage7) + + % , + % RClaimStage8 = add_node(Stage7Ring, N1, l3n15, loc3, Params), + % {RingSize, Mappings8} = riak_core_ring:chash(RClaimStage8), + % check_for_failures(Mappings8, TargetN, RClaimStage8), + % Stage8Ring = commit_change(RClaimStage8), + + % RClaimStage9 = add_node(Stage8Ring, N1, l4n16, loc4, Params), + % {RingSize, Mappings9} = riak_core_ring:chash(RClaimStage9), + % check_for_failures(Mappings9, TargetN, RClaimStage9), + % _Stage9Ring = commit_change(RClaimStage9) + . + + +add_node(Ring, Claimant, Node, Location, Params) -> + RingA = riak_core_ring:add_member(Claimant, Ring, Node), + RingB = riak_core_ring:set_node_location(Node, Location, RingA), + RingC = + riak_core_membership_claim:claim( + RingB, + {riak_core_membership_claim, default_wants_claim}, + {riak_core_claim_location, choose_claim_v4, Params}), + OwnersPre = riak_core_ring:all_owners(RingA), + OwnersPost = riak_core_ring:all_owners(RingC), + OwnersZip = lists:zip(OwnersPre, OwnersPost), + Next = + [{Idx, PrevOwner, NewOwner, [], awaiting} || + {{Idx, PrevOwner}, {Idx, NewOwner}} <- OwnersZip, + PrevOwner /= NewOwner], + % StartingNodes = riak_core_ring:all_members(Ring), + % ExpectedTransferMax = 2 * (length(OwnersPre) div length(StartingNodes)), + NodeCountD = + lists:foldl( + fun({_Idx, N}, D) -> + dict:update_counter(N, 1, D) + end, + dict:new(), + OwnersPost + ), + NodeCounts = + lists:map(fun({_N, C}) -> C end, dict:to_list(NodeCountD)), + io:format( + % user, + "NodeCounts~w~n", + [dict:to_list(NodeCountD)]), + io:format( + % user, + "Adding node ~w in location ~w - ~w transfers ~w max ~w min vnodes~n", + [Node, Location, + length(Next), lists:max(NodeCounts), lists:min(NodeCounts)]), + ?assert( + (lists:min(NodeCounts) == (lists:max(NodeCounts) - 1)) or + (lists:min(NodeCounts) == lists:max(NodeCounts)) + ), + % ?assert(length(Next) =< ExpectedTransferMax), + RingC. + +commit_change(Ring) -> + lists:foldl( + fun(JN, R) -> + riak_core_ring:set_member(node(), R, JN, valid, same_vclock) + end, + Ring, + riak_core_ring:members(Ring, [joining]) + ). + -endif. \ No newline at end of file diff --git a/src/riak_core_location.erl b/src/riak_core_location.erl index 854ba43ff..ba4478564 100644 --- a/src/riak_core_location.erl +++ b/src/riak_core_location.erl @@ -135,7 +135,7 @@ support_locations_claim(Ring, TargetNVal) -> end. %% @doc -%% Find a mapping between Idx vales and the locations of the nodes which +%% Find a mapping between Idx ids and the locations of the nodes which %% currently own them -spec get_location_owners( riak_core_ring:riak_core_ring()) -> list({non_neg_integer(), atom()}). 
diff --git a/src/riak_core_membership_claim.erl b/src/riak_core_membership_claim.erl index cc6c5ae04..1f6298337 100644 --- a/src/riak_core_membership_claim.erl +++ b/src/riak_core_membership_claim.erl @@ -102,9 +102,9 @@ claim(Ring) -> Choose = case app_helper:get_env(riak_core, choose_claim_fun) of choose_claim_v2 -> - {riak_core_memberhsip_claim, choose_claim_v2}; + {riak_core_membership_claim, choose_claim_v2}; choose_claim_v3 -> - {riak_core_memberhsip_claim, choose_claim_v3}; + {riak_core_membership_claim, choose_claim_v3}; choose_claim_v4 -> {riak_core_claim_location, choose_claim_v4}; {CMod, CFun} -> From 4e2c9630b36cd0d1522ccd6b0464859397cb5782 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Fri, 31 Mar 2023 00:25:14 +0100 Subject: [PATCH 06/30] Refinements to claim_v4 Extended potential for test by determining what nodes are safe to both add and remove from loops, rather than simply relying on sequential order. --- src/riak_core_claim_location.erl | 360 +++++++++++++++++++++++++++---- 1 file changed, 319 insertions(+), 41 deletions(-) diff --git a/src/riak_core_claim_location.erl b/src/riak_core_claim_location.erl index f901e5716..98fb5c8f2 100644 --- a/src/riak_core_claim_location.erl +++ b/src/riak_core_claim_location.erl @@ -30,6 +30,8 @@ sort_members_for_choose/3 ]). +-type location_finder() :: fun((node()) -> atom()). + -spec sort_members_for_choose( riak_core_ring:riak_core_ring(), list(node()), @@ -580,7 +582,7 @@ sequential_claim(Ring, Node, TargetN) -> riak_core_location:support_locations_claim(Ring, TargetN), {SolveableLocationViolation, LocationShortfall} = case {LocationsSupported, Overhang, RingSize div NodeCount} of - {true, OH, Loops} when OH > 0, OH > TargetN, Loops > 1 -> + {true, OH, Loops} when OH > 0, Loops > 1 -> MinDistance = check_for_location_tail_violation( Nodes, Ring, OH, TargetN), @@ -593,7 +595,7 @@ sequential_claim(Ring, Node, TargetN) -> _ -> {false, 0} end; - _ -> + LInfo -> {false, 0} end, @@ -601,12 +603,15 @@ sequential_claim(Ring, Node, TargetN) -> Zipped = case {SolveableLocationViolation, SolveableNodeViolation} of {true, _} -> + F = location_finder(Ring), Nodelist = - solve_tail_violations(RingSize, Nodes, LocationShortfall), + solve_tail_violations( + RingSize, Nodes, LocationShortfall, TargetN, true, F), lists:zip(Partitions, Nodelist); {_, true} -> Nodelist = - solve_tail_violations(RingSize, Nodes, Shortfall), + solve_tail_violations( + RingSize, Nodes, Shortfall, TargetN, false, undefined), lists:zip(Partitions, Nodelist); _ -> riak_core_membership_claim:diagonal_stripe(Ring, Nodes) @@ -617,6 +622,12 @@ sequential_claim(Ring, Node, TargetN) -> Ring, Zipped). +-spec location_finder(riak_core_ring:riak_core_ring()) -> location_finder(). +location_finder(Ring) -> + LocationD = riak_core_ring:get_nodes_locations(Ring), + fun(N) -> + riak_core_location:get_node_location(N, LocationD) + end. -spec check_for_location_tail_violation( list(node()), @@ -624,16 +635,18 @@ sequential_claim(Ring, Node, TargetN) -> pos_integer(), pos_integer()) -> pos_integer(). 
check_for_location_tail_violation(Nodes, Ring, OH, TargetN) -> - LastNodes = lists:sublist(Nodes, 1 + OH - TargetN, TargetN), - FirstNodes = lists:sublist(Nodes, TargetN), - LocationD = riak_core_ring:get_nodes_locations(Ring), - LocationFinder = - fun(N) -> riak_core_location:get_node_location(N, LocationD) end, + {LastLoop, ExtraNodes} = lists:split(OH, Nodes), + LastNodes = + lists:reverse( + lists:sublist( + lists:reverse(ExtraNodes ++ LastLoop), TargetN - 1)), + FirstNodes = lists:sublist(Nodes, TargetN - 1), + LocationFinder = location_finder(Ring), LastLocations = lists:map(LocationFinder, LastNodes), FirstLocations = lists:zip( lists:map(LocationFinder, FirstNodes), - lists:seq(0, TargetN - 1)), + lists:seq(1, TargetN - 1)), {MinDistance, _} = lists:foldl( fun(L, {MinStep, TailStep}) -> @@ -644,7 +657,7 @@ check_for_location_tail_violation(Nodes, Ring, OH, TargetN) -> {MinStep, TailStep - 1} end end, - {TargetN, TargetN - 1}, + {TargetN, TargetN - 2}, LastLocations), MinDistance. @@ -662,13 +675,20 @@ solveable_violation(RingSize, NodeCount, TargetN, Shortfall) -> end. %% @doc -%% The node list mosut be of length ring size. It is made up of a set of +%% The node list must be of length ring size. It is made up of a set of %% complete loops of the node list, and then a partial loop with the addition %% of the shortfall. The for each node in the shortfall a node in the complete %% loops must be removed -spec solve_tail_violations( - pos_integer(), [node()], non_neg_integer()) -> [[node()]]. -solve_tail_violations(RingSize, Nodes, Shortfall) -> + pos_integer(), + [node()], + non_neg_integer(), + pos_integer(), + boolean(), + undefined|location_finder()) -> + [[node()]]. +solve_tail_violations( + RingSize, Nodes, Shortfall, _TargetN, false, _LocFinder) -> {LastLoop, Remainder} = lists:split(RingSize rem length(Nodes), Nodes), ExcessLoop = lists:sublist(Remainder, Shortfall), @@ -683,7 +703,221 @@ solve_tail_violations(RingSize, Nodes, Shortfall) -> lists:map( fun(ENL) -> lists:subtract(Nodes, ENL) end, RemoveList), - CompleteLoops ++ lists:append(PartialLoops) ++ Tail. 
+ CompleteLoops ++ lists:append(PartialLoops) ++ Tail; +solve_tail_violations( + RingSize, Nodes, Shortfall, TargetN, true, LocFinder) -> + {LastLoop, Remainder} = + lists:split(RingSize rem length(Nodes), Nodes), + PostLoop = lists:sublist(Nodes, TargetN - 1), + PreExcess = + lists:reverse( + lists:sublist( + lists:reverse(Nodes ++ LastLoop), TargetN - 1)), + {SafeList, SafeAdditions} = + case safe_to_remove(Nodes, LastLoop, TargetN, LocFinder) of + SL when length(SL) >= Shortfall -> + {lists:sublist(SL, Shortfall), Remainder}; + SL -> + RemovableExcess = + safe_to_remove( + Nodes, Remainder, TargetN, LocFinder), + {SL, RemovableExcess} + end, + ExcessLoop = + case length(SafeAdditions) of + NodesToCheck when NodesToCheck >= Shortfall -> + safe_to_add( + PreExcess, PostLoop, SafeAdditions, LocFinder, Shortfall); + NodesToCheck -> + CheckList = + SafeAdditions ++ + lists:sublist( + lists:subtract(Remainder, SafeAdditions), + Shortfall - NodesToCheck), + safe_to_add( + PreExcess, PostLoop, CheckList, LocFinder, Shortfall) + end, + + Tail = LastLoop ++ ExcessLoop, + LoopCount = RingSize div length(Nodes), + RemoveCount = length(ExcessLoop), + RemoveList = + divide_list_for_removes( + lists:sublist(SafeList ++ ExcessLoop, RemoveCount), LoopCount), + + case LoopCount > (2 * RemoveCount) of + true -> + PartialLoops = + lists:map( + fun(ENL) -> lists:subtract(Nodes, ENL) ++ Nodes end, + RemoveList), + CompleteLoops = + lists:flatten( + lists:duplicate(LoopCount - (2 * RemoveCount), Nodes)), + CompleteLoops ++ lists:append(PartialLoops) ++ Tail; + false -> + CompleteLoops = + lists:flatten( + lists:duplicate(LoopCount - RemoveCount, Nodes)), + PartialLoops = + lists:map( + fun(ENL) -> lists:subtract(Nodes, ENL) end, + RemoveList), + CompleteLoops ++ lists:append(PartialLoops) ++ Tail + end. + + +-spec safe_to_add( + list(node()), + list(node()), + list(node()), + location_finder()|undefined, + pos_integer()) -> list(node()). +safe_to_add(PreExcess, PostLoop, NodesToCheck, LocFinder, Shortfall) -> + NodePositions = + score_for_adding( + lists:zip( + lists:map(LocFinder, lists:reverse(PreExcess)), + lists:seq(1, length(PreExcess))), + lists:zip( + lists:map(LocFinder, PostLoop), + lists:seq(1, length(PostLoop))), + lists:map(LocFinder, NodesToCheck), + [], + Shortfall), + PositionsByNode = lists:zip(NodePositions, NodesToCheck), + Positions = lists:seq(1, Shortfall), + case choose_positions(Positions, PositionsByNode, [], {[], LocFinder}) of + fail -> + lists:sublist(NodesToCheck, Shortfall); + NodeList -> + lists:reverse(NodeList) + end. + +choose_positions([], _PositionsByNode, NodeList, _LocationCheck) -> + NodeList; +choose_positions([Pos|RestPos], PositionsByNode, NodeList, {LocList, LocF}) -> + SortedPositionsByNode = + lists:filter( + fun({PL, _N}) -> length(PL) > 0 end, + lists:sort(PositionsByNode)), + case SortedPositionsByNode of + [{TopPL, TopN}|RestPBN] -> + TopL = LocF(TopN), + case {lists:member(Pos, TopPL), lists:member(TopL, LocList)} of + {true, false} -> + choose_positions( + RestPos, + lists:map( + fun({PL, N}) -> {PL -- [Pos], N} end, + RestPBN), + [TopN|NodeList], + {[TopL|LocList], LocF}); + {true, true} -> + choose_positions( + [Pos|RestPos], + RestPBN, + NodeList, + {LocList, LocF}); + _ -> + fail + end; + _ -> + fail + end. + + +-spec score_for_adding( + list({node()|atom(), pos_integer()}), + list({node()|atom(), pos_integer()}), + list(node()|atom()), + list(list(pos_integer())), + pos_integer()) -> + list(list(pos_integer())). 
+score_for_adding(_PreExcess, _PostLoop, [], NodePositions, _Shortfall) -> + lists:reverse(NodePositions); +score_for_adding(PreExcess, PostLoop, [HD|Rest], NodePositions, Shortfall) -> + BackPositions = + case lists:keyfind(HD, 1, PreExcess) of + {HD, BS} -> + lists:filter( + fun(P) -> (P + BS - 1) > length(PreExcess) end, + lists:seq(1, Shortfall) + ); + false -> + lists:seq(1, Shortfall) + end, + ForwardPositions = + case lists:keyfind(HD, 1, PostLoop) of + {HD, FS} -> + lists:filter( + fun(P) -> (FS + Shortfall - P) > length(PostLoop) end, + lists:seq(1, Shortfall)); + false -> + lists:seq(1, Shortfall) + end, + SupportedPositions = + lists:filter( + fun(BP) -> lists:member(BP, ForwardPositions) end, BackPositions), + score_for_adding( + PreExcess, + PostLoop, + Rest, + [SupportedPositions|NodePositions], + Shortfall). + + +-spec safe_to_remove( + list(node()), + list(node()), + pos_integer(), + location_finder()|undefined) -> list(node()). +safe_to_remove(Nodes, NodesToCheck, TargetN, LocFinder) -> + LocationFinder = fun(N) -> {N, LocFinder(N)} end, + safe_to_remove_loop( + lists:map(LocationFinder, Nodes), + lists:map(LocationFinder, NodesToCheck), + [], + TargetN). + +safe_to_remove_loop(_Nodes, [], SafeList, _TargetN) -> + SafeList; +safe_to_remove_loop(Nodes, [HD|Rest], SafeList, TargetN) -> + WrappedNodes = (Nodes -- [HD]) ++ lists:sublist(Nodes, 1, TargetN), + {Node, _Location} = HD, + CheckFun = + fun({_N, L}, CheckList) -> + case lists:keyfind(L, 2, CheckList) of + false -> + false; + _ -> + true + end + end, + IsSafe = + lists:foldl( + fun(N, Acc) -> + case Acc of + fail -> + fail; + LastNminus1 when is_list(LastNminus1) -> + case CheckFun(N, LastNminus1) of + false -> + [N|lists:sublist(LastNminus1, TargetN - 2)]; + true -> + fail + end + end + end, + [], + WrappedNodes), + case IsSafe of + fail -> + safe_to_remove_loop(Nodes, Rest, SafeList, TargetN); + _ -> + safe_to_remove_loop(Nodes, Rest, [Node|SafeList], TargetN) + end. + %% @doc %% Normally need to remove one of the excess nodes each loop around the node @@ -712,8 +946,8 @@ divide_list_for_removes(Excess, LoopCount) -> %% @private %% Get active nodes ordered by taking location parameters into account --spec get_nodes_by_location([node()|undefined], riak_core_ring:riak_core_ring()) -> - [node()|undefined]. +-spec get_nodes_by_location( + [node()|undefined], riak_core_ring:riak_core_ring()) -> [node()|undefined]. get_nodes_by_location(Nodes, Ring) -> NodesLocations = riak_core_ring:get_nodes_locations(Ring), case riak_core_location:has_location_set_in_cluster(NodesLocations) of @@ -733,9 +967,9 @@ stripe_nodes_by_location(NodesByLocation) -> stripe_nodes_by_location(RestLNodes, lists:map(fun(N) -> [N] end, LNodes)). stripe_nodes_by_location([], Acc) -> - lists:flatten(Acc); + lists:flatten(lists:reverse(sort_lists_by_length(Acc))); stripe_nodes_by_location([LNodes|OtherLNodes], Acc) -> - SortedAcc = sort_lists_by_length(Acc), + SortedAcc = lists:reverse(sort_lists_by_length(Acc)), {UpdatedAcc, []} = lists:mapfoldl( fun(NodeList, LocationNodesToAdd) -> @@ -762,7 +996,30 @@ sort_lists_by_length(ListOfLists) -> -include_lib("eunit/include/eunit.hrl"). -simple_cluster_test() -> +choose_positions_test() -> + NodePositions = [{[1,2],l4n5},{[1,2],l5n5},{[1,2],l6n5},{[],l1n6}], + Positions = [1, 2], + LocF = fun(N) -> list_to_atom(lists:sublist(atom_to_list(N), 2)) end, + ?assertMatch( + [l4n5, l5n5], + lists:reverse( + choose_positions(Positions, NodePositions, [], {[], LocF}))). 
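The test above and `score_for_adding_test` below pin down the position-scoring behaviour. Informally, and as a reading of the code rather than an authoritative specification: `score_for_adding/5` offers each candidate node only the shortfall positions that keep TargetN spacing to its nearest existing occurrence, and `choose_positions/4` then fills the positions, taking the most constrained candidate first and never repeating a location. A worked reading of one case from `score_for_adding_test`:
```
%% PreExcess = [n2, n3, n4] (nodes just before the gap, n4 nearest),
%% PostLoop  = [n1, n2, n3] (nodes just after the gap, n1 nearest),
%% Shortfall = 4 positions to fill.
%% n4 sits directly before the gap, so it is only offered position 4,
%% the slot furthest from its existing occurrence; n1 sits directly
%% after the gap, so it is only offered position 1. Unconstrained
%% candidates (n5, n6, n7) are offered all of [1, 2, 3, 4].
```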
+ + +score_for_adding_test() -> + PreExcess = [n2, n3, n4], + PostLoop = [n1, n2, n3], + PE = lists:zip(lists:reverse(PreExcess), lists:seq(1, length(PreExcess))), + PL = lists:zip(PostLoop, lists:seq(1, length(PostLoop))), + Candidates = [n1, n4, n5, n6, n7], + Shortfall = 4, + ExpectedResult = + [[1], [4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], + ActualResult = + score_for_adding(PE, PL, Candidates, [], Shortfall), + ?assertMatch(ExpectedResult, ActualResult). + +simple_cluster_t1_test() -> RingSize = 32, TargetN = 4, NodeList = [n1, n2, n3, n4, n5, n6], @@ -781,7 +1038,19 @@ simple_cluster_test() -> Failures = meets_target_n(RClaim, TargetN), lists:foreach(fun(F) -> io:format("Failure ~p~n", [F]) end, Failures), ?assert(length(Failures) == 0). - + +sort_list_t1_test() -> + OtherLoc = + [[l2n1, l2n2], [l3n1, l3n2], [l4n1, l4n2], [l5n1, l5n2], + [l6n1], [l7n1], [l8n1]], + FirstLoc = [[l1n1], [l1n2]], + NodeList = stripe_nodes_by_location(OtherLoc, FirstLoc), + ExpectedNodeList = + [l1n1, l2n2, l3n1, l4n2, l5n1, l7n1, + l1n2, l2n1, l3n2, l4n1, l5n2, l6n1, l8n1], + ?assertMatch( + ExpectedNodeList, NodeList + ). prefilter_violations_test_() -> % Be strict on test timeout. Unrefined code took > 10s, whereas the @@ -960,6 +1229,19 @@ location_seqclaim_t6_test() -> location_claim_tester(l1n1, loc1, JoiningNodes, 1024), location_claim_tester(l1n1, loc1, JoiningNodes, 2048). +location_seqclaim_t7_test() -> + JoiningNodes = + [{l1n2, loc1}, {l1n3, loc1}, + {l2n1, loc2}, {l2n2, loc2}, {l2n3, loc2}, + {l3n1, loc3}, {l3n2, loc3}, {l3n3, loc3}, + {l4n1, loc4}, {l4n2, loc4}, + {l5n1, loc5}, {l5n2, loc5}, + {l6n1, loc6}, {l6n2, loc6}], + location_claim_tester(l1n1, loc1, JoiningNodes, 256), + location_claim_tester(l1n1, loc1, JoiningNodes, 512), + location_claim_tester(l1n1, loc1, JoiningNodes, 1024), + location_claim_tester(l1n1, loc1, JoiningNodes, 2048). + location_claim_tester(N1, N1Loc, NodeLocList, RingSize) -> location_claim_tester( N1, N1Loc, NodeLocList, RingSize, sequential_claim, 4). @@ -1101,11 +1383,11 @@ location_multistage_t4_tester() -> {l5n9, loc5} ], - location_multistage_claim_tester(64, JoiningNodes, 4, l5n10, loc5, 4). - % location_multistage_claim_tester(128, JoiningNodes, 4, l5n10, loc5, 4), - % location_multistage_claim_tester(256, JoiningNodes, 4, l5n10, loc5, 4), - % location_multistage_claim_tester(512, JoiningNodes, 4, l5n10, loc5, 4), - % location_multistage_claim_tester(1024, JoiningNodes, 4, l5n10, loc5, 4), + location_multistage_claim_tester(64, JoiningNodes, 4, l5n10, loc5, 4), + location_multistage_claim_tester(128, JoiningNodes, 4, l5n10, loc5, 4), + location_multistage_claim_tester(256, JoiningNodes, 4, l5n10, loc5, 4), + location_multistage_claim_tester(512, JoiningNodes, 4, l5n10, loc5, 4), + location_multistage_claim_tester(1024, JoiningNodes, 4, l5n10, loc5, 4). %, % location_multistage_claim_tester(2048, JoiningNodes, 4, l5n10, loc5, 4). location_multistage_claim_tester( @@ -1203,10 +1485,8 @@ location_multistage_claim_tester( ). location_typical_expansion_test() -> - location_typical_expansion_tester(256) - %, - % location_typical_expansion_tester(512) - . + location_typical_expansion_tester(256), + location_typical_expansion_tester(512). 
location_typical_expansion_tester(RingSize) -> N1 = l1n1, @@ -1284,19 +1564,17 @@ location_typical_expansion_tester(RingSize) -> RClaimStage7 = add_node(Stage6Ring, N1, l2n14, loc2, Params), {RingSize, Mappings7} = riak_core_ring:chash(RClaimStage7), check_for_failures(Mappings7, TargetN, RClaimStage7), - _Stage7Ring = commit_change(RClaimStage7) + Stage7Ring = commit_change(RClaimStage7), - % , - % RClaimStage8 = add_node(Stage7Ring, N1, l3n15, loc3, Params), - % {RingSize, Mappings8} = riak_core_ring:chash(RClaimStage8), - % check_for_failures(Mappings8, TargetN, RClaimStage8), - % Stage8Ring = commit_change(RClaimStage8), + RClaimStage8 = add_node(Stage7Ring, N1, l3n15, loc3, Params), + {RingSize, Mappings8} = riak_core_ring:chash(RClaimStage8), + check_for_failures(Mappings8, TargetN, RClaimStage8), + Stage8Ring = commit_change(RClaimStage8), - % RClaimStage9 = add_node(Stage8Ring, N1, l4n16, loc4, Params), - % {RingSize, Mappings9} = riak_core_ring:chash(RClaimStage9), - % check_for_failures(Mappings9, TargetN, RClaimStage9), - % _Stage9Ring = commit_change(RClaimStage9) - . + RClaimStage9 = add_node(Stage8Ring, N1, l4n16, loc4, Params), + {RingSize, Mappings9} = riak_core_ring:chash(RClaimStage9), + check_for_failures(Mappings9, TargetN, RClaimStage9), + _Stage9Ring = commit_change(RClaimStage9). add_node(Ring, Claimant, Node, Location, Params) -> From 3ab0f4bb7239a9a98751253f6bb2ad13e2d4ea4f Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Fri, 31 Mar 2023 00:26:26 +0100 Subject: [PATCH 07/30] Correction following removal of log --- src/riak_core_claim_location.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/riak_core_claim_location.erl b/src/riak_core_claim_location.erl index 98fb5c8f2..0ae1d4cd9 100644 --- a/src/riak_core_claim_location.erl +++ b/src/riak_core_claim_location.erl @@ -595,7 +595,7 @@ sequential_claim(Ring, Node, TargetN) -> _ -> {false, 0} end; - LInfo -> + _ -> {false, 0} end, From 8fe7fc7e1f21c3c39e0f7d130dda2d0624d1a2ce Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Fri, 31 Mar 2023 08:28:10 +0100 Subject: [PATCH 08/30] Count remove list not Excess to determine loops --- src/riak_core_claim_location.erl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/riak_core_claim_location.erl b/src/riak_core_claim_location.erl index 0ae1d4cd9..71bc2fe79 100644 --- a/src/riak_core_claim_location.erl +++ b/src/riak_core_claim_location.erl @@ -744,8 +744,9 @@ solve_tail_violations( RemoveList = divide_list_for_removes( lists:sublist(SafeList ++ ExcessLoop, RemoveCount), LoopCount), + RemoveLoops = length(RemoveList), - case LoopCount > (2 * RemoveCount) of + case LoopCount > (2 * RemoveLoops) of true -> PartialLoops = lists:map( @@ -753,12 +754,12 @@ solve_tail_violations( RemoveList), CompleteLoops = lists:flatten( - lists:duplicate(LoopCount - (2 * RemoveCount), Nodes)), + lists:duplicate(LoopCount - (2 * RemoveLoops), Nodes)), CompleteLoops ++ lists:append(PartialLoops) ++ Tail; false -> CompleteLoops = lists:flatten( - lists:duplicate(LoopCount - RemoveCount, Nodes)), + lists:duplicate(LoopCount - RemoveLoops, Nodes)), PartialLoops = lists:map( fun(ENL) -> lists:subtract(Nodes, ENL) end, From 58d626487ac36f4e9f4053c31beed5d6e934fb79 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Fri, 31 Mar 2023 15:55:56 +0100 Subject: [PATCH 09/30] Better order of initial striping Resolves some test issues. 
Also try harder to do safe removals when looking back in the ring (as opposed to the removal other additions) --- src/riak_core_claim_location.erl | 92 +++++++++++++++++------------- src/riak_core_membership_claim.erl | 1 + 2 files changed, 54 insertions(+), 39 deletions(-) diff --git a/src/riak_core_claim_location.erl b/src/riak_core_claim_location.erl index 71bc2fe79..c7af5dd89 100644 --- a/src/riak_core_claim_location.erl +++ b/src/riak_core_claim_location.erl @@ -582,7 +582,7 @@ sequential_claim(Ring, Node, TargetN) -> riak_core_location:support_locations_claim(Ring, TargetN), {SolveableLocationViolation, LocationShortfall} = case {LocationsSupported, Overhang, RingSize div NodeCount} of - {true, OH, Loops} when OH > 0, Loops > 1 -> + {true, OH, Loops} when OH > 0, Loops >= 1 -> MinDistance = check_for_location_tail_violation( Nodes, Ring, OH, TargetN), @@ -595,7 +595,7 @@ sequential_claim(Ring, Node, TargetN) -> _ -> {false, 0} end; - _ -> + _NotSolveable -> {false, 0} end, @@ -737,13 +737,19 @@ solve_tail_violations( safe_to_add( PreExcess, PostLoop, CheckList, LocFinder, Shortfall) end, - + Tail = LastLoop ++ ExcessLoop, LoopCount = RingSize div length(Nodes), RemoveCount = length(ExcessLoop), + UpdSafeList = + SafeList ++ + lists:filter( + fun(N) -> lists:member(N, ExcessLoop) end, SafeAdditions) ++ + (ExcessLoop -- SafeAdditions), + RemoveList = divide_list_for_removes( - lists:sublist(SafeList ++ ExcessLoop, RemoveCount), LoopCount), + lists:sublist(UpdSafeList, RemoveCount), LoopCount), RemoveLoops = length(RemoveList), case LoopCount > (2 * RemoveLoops) of @@ -842,7 +848,10 @@ score_for_adding(PreExcess, PostLoop, [HD|Rest], NodePositions, Shortfall) -> case lists:keyfind(HD, 1, PreExcess) of {HD, BS} -> lists:filter( - fun(P) -> (P + BS - 1) > length(PreExcess) end, + fun(P) -> + {A, B} = {(P + BS - 1), length(PreExcess)}, + A > B + end, lists:seq(1, Shortfall) ); false -> @@ -852,7 +861,10 @@ score_for_adding(PreExcess, PostLoop, [HD|Rest], NodePositions, Shortfall) -> case lists:keyfind(HD, 1, PostLoop) of {HD, FS} -> lists:filter( - fun(P) -> (FS + Shortfall - P) > length(PostLoop) end, + fun(P) -> + {A, B} = {(FS + Shortfall - P), length(PostLoop)}, + A > B + end, lists:seq(1, Shortfall)); false -> lists:seq(1, Shortfall) @@ -965,25 +977,34 @@ stripe_nodes_by_location(NodesByLocation) -> [LNodes|RestLNodes] = sort_lists_by_length( lists:map(fun({_L, NL}) -> NL end, dict:to_list(NodesByLocation))), - stripe_nodes_by_location(RestLNodes, lists:map(fun(N) -> [N] end, LNodes)). + stripe_nodes_by_location( + RestLNodes, + lists:map( + fun({I, L}) -> {1, I, L} end, + lists:zip( + lists:seq(1, length(LNodes)), + lists:map(fun(N) -> [N] end, LNodes)))). stripe_nodes_by_location([], Acc) -> - lists:flatten(lists:reverse(sort_lists_by_length(Acc))); + lists:flatten( + lists:map(fun({_L, _I, NL}) -> NL end, lists:sort(Acc)) + ); stripe_nodes_by_location([LNodes|OtherLNodes], Acc) -> - SortedAcc = lists:reverse(sort_lists_by_length(Acc)), + SortedAcc = lists:sort(Acc), {UpdatedAcc, []} = lists:mapfoldl( - fun(NodeList, LocationNodesToAdd) -> + fun({L, I, NodeList}, LocationNodesToAdd) -> case LocationNodesToAdd of [NodeToAdd|TailNodes] -> - {NodeList ++ [NodeToAdd], TailNodes}; + {{L + 1, I, NodeList ++ [NodeToAdd]}, TailNodes}; [] -> - {NodeList, []} + {{L, I, NodeList}, []} end end, LNodes, SortedAcc), stripe_nodes_by_location(OtherLNodes, UpdatedAcc). + sort_lists_by_length(ListOfLists) -> lists:sort(fun(L1, L2) -> length(L1) >= length(L2) end, ListOfLists). 
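The reworked striping threads `{Length, Index, Nodes}` triples through the accumulator, so plain Erlang term ordering picks the stripe to extend next: the stripe holding the fewest nodes first, with ties broken by the stable stripe index. A quick shell illustration of that ordering (illustrative values only):
```
1> lists:sort([{2, 1, [a, b]}, {1, 2, [c]}, {2, 3, [d, e]}]).
[{1,2,[c]},{2,1,[a,b]},{2,3,[d,e]}]
```
The updated `sort_list_t1_test` in the next hunk shows the resulting interleaving for an uneven set of locations.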
@@ -1044,11 +1065,11 @@ sort_list_t1_test() -> OtherLoc = [[l2n1, l2n2], [l3n1, l3n2], [l4n1, l4n2], [l5n1, l5n2], [l6n1], [l7n1], [l8n1]], - FirstLoc = [[l1n1], [l1n2]], + FirstLoc = [{1, 1, [l1n1]}, {1, 2, [l1n2]}], NodeList = stripe_nodes_by_location(OtherLoc, FirstLoc), ExpectedNodeList = - [l1n1, l2n2, l3n1, l4n2, l5n1, l7n1, - l1n2, l2n1, l3n2, l4n1, l5n2, l6n1, l8n1], + [l1n2, l2n2, l3n2, l4n2, l5n2, l7n1, + l1n1, l2n1, l3n1, l4n1, l5n1, l6n1, l8n1], ?assertMatch( ExpectedNodeList, NodeList ). @@ -1191,6 +1212,7 @@ location_seqclaim_t4_test() -> {l6n1, loc6}, {l6n2, loc6}, {l6n3, loc6}, {l6n4, loc6}, {l6n5, loc6}, {l6n6, loc6}, {l6n7, loc6}, {l7n1, loc7}, {l7n2, loc7}, {l7n3, loc7}], + location_claim_tester(l1n1, loc1, JoiningNodes, 64), location_claim_tester(l1n1, loc1, JoiningNodes, 128), location_claim_tester(l1n1, loc1, JoiningNodes, 256), location_claim_tester(l1n1, loc1, JoiningNodes, 512), @@ -1243,6 +1265,17 @@ location_seqclaim_t7_test() -> location_claim_tester(l1n1, loc1, JoiningNodes, 1024), location_claim_tester(l1n1, loc1, JoiningNodes, 2048). +location_seqclaim_t8_test() -> + JoiningNodes = + [{l1n2, loc1}, {l1n3, loc1}, {l1n4, loc1}, + {l2n1, loc2}, {l2n2, loc2}, {l2n3, loc2}, + {l3n1, loc3}, {l3n2, loc3}, {l3n3, loc3}, + {l4n1, loc4}, {l4n2, loc4}, {l4n3, loc4}], + location_claim_tester(l1n1, loc1, JoiningNodes, 256, sequential_claim, 3), + location_claim_tester(l1n1, loc1, JoiningNodes, 512, sequential_claim, 3), + location_claim_tester(l1n1, loc1, JoiningNodes, 1024, sequential_claim, 3), + location_claim_tester(l1n1, loc1, JoiningNodes, 2048, sequential_claim, 3). + location_claim_tester(N1, N1Loc, NodeLocList, RingSize) -> location_claim_tester( N1, N1Loc, NodeLocList, RingSize, sequential_claim, 4). @@ -1319,11 +1352,8 @@ location_multistage_t1_test_() -> location_multistage_t2_test_() -> {timeout, 60, fun location_multistage_t2_tester/0}. -% location_multistage_t3_test_() -> -% {timeout, 60, fun location_multistage_t3_tester/0}. - -location_multistage_t4_test_() -> - {timeout, 60, fun location_multistage_t4_tester/0}. +location_multistage_t3_test_() -> + {timeout, 60, fun location_multistage_t3_tester/0}. location_multistage_t1_tester() -> %% This is a tricky corner case where we would fail to meet TargetN for @@ -1359,23 +1389,7 @@ location_multistage_t2_tester() -> location_multistage_claim_tester(1024, JoiningNodes, 3, l4n7, loc4, 2), location_multistage_claim_tester(2048, JoiningNodes, 3, l4n7, loc4, 2). -% location_multistage_t3_tester() -> -% %% This is a minimal case for having TargetN locations, and an uneven -% %% Alloctaion around the locations. Is TargetN - 1 still held up -% JoiningNodes = -% [{l1n2, loc1}, -% {l2n3, loc2}, {l2n6, loc2}, -% {l3n4, loc3}, -% {l4n5, loc4} -% ], -% location_multistage_claim_tester(64, JoiningNodes, 4, l3n7, loc3, 3), -% location_multistage_claim_tester(128, JoiningNodes, 4, l3n7, loc3, 3), -% location_multistage_claim_tester(256, JoiningNodes, 4, l3n7, loc3, 3), -% location_multistage_claim_tester(512, JoiningNodes, 4, l3n7, loc3, 3), -% location_multistage_claim_tester(1024, JoiningNodes, 4, l3n7, loc3, 3), -% location_multistage_claim_tester(2048, JoiningNodes, 4, l3n7, loc3, 3). 
- -location_multistage_t4_tester() -> +location_multistage_t3_tester() -> JoiningNodes = [{l1n2, loc1}, {l2n3, loc2}, {l2n4, loc2}, @@ -1388,8 +1402,8 @@ location_multistage_t4_tester() -> location_multistage_claim_tester(128, JoiningNodes, 4, l5n10, loc5, 4), location_multistage_claim_tester(256, JoiningNodes, 4, l5n10, loc5, 4), location_multistage_claim_tester(512, JoiningNodes, 4, l5n10, loc5, 4), - location_multistage_claim_tester(1024, JoiningNodes, 4, l5n10, loc5, 4). %, - % location_multistage_claim_tester(2048, JoiningNodes, 4, l5n10, loc5, 4). + location_multistage_claim_tester(1024, JoiningNodes, 4, l5n10, loc5, 4), + location_multistage_claim_tester(2048, JoiningNodes, 4, l5n10, loc5, 4). location_multistage_claim_tester( RingSize, JoiningNodes, TargetN, NewNode, NewLocation, VerifyN) -> diff --git a/src/riak_core_membership_claim.erl b/src/riak_core_membership_claim.erl index 1f6298337..59a60b13f 100644 --- a/src/riak_core_membership_claim.erl +++ b/src/riak_core_membership_claim.erl @@ -241,6 +241,7 @@ default_choose_params(Params) -> get_target_n() -> app_helper:get_env(riak_core, target_n_val, ?DEF_TARGET_N). + %% =================================================================== %% Claim Function Implementations %% =================================================================== From bfe605b07744d946140d010e5bc90a12c454339f Mon Sep 17 00:00:00 2001 From: Thomas Arts Date: Wed, 3 May 2023 11:31:17 +0200 Subject: [PATCH 10/30] A new claim algorithm (#1003) * Support two transition changes Where the second transition is triggered by a change of location. Need to ensure that the location_changed status update is recognised in the ring * Unrelated fix to remove reference to gen_fsm_compat * unrelated fix to get rid of deprecation warning * Testing claim * The new claim algorithm as purely functional algorithm * add new entry for version 5 claiming * Refactor v5 into v4 * move impossible config test to place where we actually may enter recursion * Documentation The algorithm should be described in more detail in a markup document * Allow configurations with zero nodes in location for better placement update This works better when a location is emptied on nodes. Less transfers. * Keep order of nodes to avoid back translate issue --------- Co-authored-by: Martin Sumner --- docs/claim-version4.md | 358 ++++++ eqc/hashtree_eqc.erl | 8 +- src/riak_core.app.src | 2 +- src/riak_core_claim_binring_alg.erl | 707 ++++++++++++ src/riak_core_claim_location.erl | 1647 --------------------------- src/riak_core_claim_sim.erl | 2 +- src/riak_core_claim_swapping.erl | 796 +++++++++++++ src/riak_core_claimant.erl | 29 +- src/riak_core_membership_claim.erl | 312 +++-- src/riak_core_ring.erl | 11 +- src/riak_core_send_msg.erl | 3 +- test/riak_core_claim_eqc.erl | 519 +++++++++ 12 files changed, 2628 insertions(+), 1766 deletions(-) create mode 100644 docs/claim-version4.md create mode 100644 src/riak_core_claim_binring_alg.erl delete mode 100644 src/riak_core_claim_location.erl create mode 100644 src/riak_core_claim_swapping.erl create mode 100644 test/riak_core_claim_eqc.erl diff --git a/docs/claim-version4.md b/docs/claim-version4.md new file mode 100644 index 000000000..11040b4ca --- /dev/null +++ b/docs/claim-version4.md @@ -0,0 +1,358 @@ +# Riak Core Claim Version 4 + +This post is about a new version of riak core's claim algorithm +[riak-core's](https://github.com/basho/riak_core). 
+
+An earlier post of [Russell](https://github.com/basho/riak_core/blob/develop/docs/claim-fixes.md)
+describes the present claim algorithm (version 2) in detail. That post is
+mainly about fixes performed to make things work with so-called tail violations.
+
+Recent enhancements of riak core in the form of [location awareness](https://github.com/basho/riak_core/blob/develop/docs/rack-awareness.md)
+have made it interesting to revisit and redesign the claim algorithm.
+
+To recapitulate from earlier posts: Riak is a replicated database. By default it
+stores three replicas of every key/value. The replication factor is called n-val,
+meaning there are n replicas, and by default n=3. If a value has to be stored or
+retrieved, the hash of its key corresponds to one particular node in the ring. It
+is stored on that node and the n-1 next nodes in the ring. When a node is
+unavailable, a value will be read from the next node.
+
+Therefore it is important to place the nodes in the ring such that n-val
+consecutive nodes are never the same physical node. After all, if a physical node
+is down and the next node in the ring maps to the same physical node, then there
+is little redundancy.
+
+Taking this idea one step further, one may imagine a perfect ring with n-val
+replication, but two of the physical nodes are in the same "location", where
+location can be a rack or a data center. What if something happens that disconnects
+two consecutive physical nodes at once? Wouldn't it be nice if one could also
+take the location into consideration when placing the nodes in the ring, such
+that the ring conveniently spreads over locations?
+
+We came up with a way to do so.
+
+# Requirements
+
+The solution for placing the nodes in a ring is performed by a so-called claim
+algorithm. The user provides the nodes, the ring size and the n-val, and from that
+a mapping is returned that tries to fulfil the following:
+
+1. the ring has exactly ring size elements.
+2. all nodes occur approximately equally often: this is called _balanced_.
+   (more precisely: for a given k, each node appears exactly k or k+1 times)
+3. n-val consecutive nodes in the ring are all in a different location.
+
+Note that one node cannot be at two locations at the same time. Therefore,
+the n-val for nodes follows from the n-val for locations.
+
+The first and second requirement cannot be relaxed. But the third one is not
+guaranteed to be possible at all. In fact, by using a SAT solver we identified
+82 impossible configurations for ring size 16 and n-val 2 or 3.
+In case there is no solution possible, the algorithm is supposed to return a
+placement that fulfils the first two requirements and makes some kind of best
+effort for the third.
+
+In principle one can imagine having an n-val for locations that is less than
+the n-val for nodes, but although the presented algorithm supports this, we
+recommend not to use that feature, since it may give unexpected placements that
+may well behave worse than simply using the same, smaller n-val for both.
+
+
+# Computing placements
+
+We start off by presenting some examples to illustrate how the algorithm can be used.
+The algorithm in [riak_core_claim_binring_alg](https://github.com/basho/riak_core/blob/develop/src/riak_core_claim_binring_alg.erl) is the core algorithm. You will most likely not
+use the API of this module directly, but by isolating the algorithm, development
+and testing become easier.
+There are basically 2 API functions that matter: `solve` and `update`.
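Before moving on to the API: requirement 2 can be stated as executable code, which is handy when eyeballing the examples that follow. A small sketch, for illustration only and not part of the module, taking a placement as a list of nodes in the format returned by `to_list/1`:
```
%% A placement is balanced when the occurrence counts of its nodes
%% differ by at most one.
is_balanced(Nodes) ->
    Counts = maps:values(
        lists:foldl(
            fun(N, Acc) -> maps:update_with(N, fun(C) -> C + 1 end, 1, Acc) end,
            #{}, Nodes)),
    lists:max(Counts) - lists:min(Counts) =< 1.
```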
+Central is the input __configuration__, presented as a list of integers, where each
+element in the list represents a location and the number represents the number of
+nodes in that location. For example, `[1,1,2]` represents 3 locations (say A, B and C)
+such that the first and second location have 1 node and the third location has 2 nodes.
+
+## Solve
+
+Solving starts from a ring size, an n-val and a configuration and provides a
+binary representation of a ring with a placement that fits that configuration.
+Consider the binary representation an opaque type, because there is no need to inspect it.
+An API function "show" can be used to produce a string (ANSI colour-coded) from such a
+binary ring (only exported in test and debug mode).
+
+In _test_ mode, or when you want to get a better understanding of the algorithm,
+solving the above `[1,1,2]` for ring size 16 and n-val 2 would be done as follows:
+```
+BinRing = riak_core_claim_binring_alg:solve(16, [1,1,2], 2).
+io:format("~s\n", [riak_core_claim_binring_alg:show(BinRing, 2)]).
+B1 C2 A1 C1 A1 C1 B1 C2 A1 C1 B1 C2 A1 C1 B1 C2 (0 violations)
+```
+The location names are assigned alphabetically and the nodes are numbered.
+B1 is the first node in the second location.
+By also providing the n-val to `show`, it can report `(0 violations)` for the
+given ring.
+
+## Update
+
+When Riak is running, it has an existing placement of nodes and locations
+in the ring. In that circumstance, one uses update to change the ring to a
+new configuration.
+
+```
+Disclaimer:
+
+We have only considered updating the configuration. It would work to update the n-val.
+But updating the ring size is something we have not spent brain cycles on. It might work.
+```
+
+One can add a new location with new nodes, or
+add/remove nodes from existing locations. Again, a best-effort approach is provided.
+In this best-effort approach, the number of transfers needed from one node to
+another is taken into consideration.
+
+### Adding a node to a location
+
+For example, if we update the ring above
+```
+B1 C2 A1 C1 A1 C1 B1 C2 A1 C1 B1 C2 A1 C1 B1 C2
+```
+with one extra node in the second location:
+```
+BinRing1 = riak_core_claim_binring_alg:update(BinRing, [1,2,2], 2).
+io:format("~s\n", [riak_core_claim_binring_alg:show(BinRing1, 2)]).
+A1 B2 A1 B2 A1 C1 B1 C2 B1 C1 B1 C2 A1 C1 B2 C2 (0 violations)
+```
+Clearly, the new ring is of size 16 and is balanced (4 A1, 3 B1, 3 B2, 3 C1 and 3 C2).
+It respects n-val 2, because no two consecutive positions share a location, not
+even when we wrap around.
+
+Another observation here is that 11 of the nodes have kept the same position in the
+ring. Clearly, some transfer is needed, but if we had used the `solve` approach to
+compute the new ring, we would have been presented with:
+```
+A1 B1 C1 A1 B2 C2 B1 C1 A1 B2 C2 B1 C1 A1 B2 C2
+```
+in which only 4 nodes have the same place in the ring.
+*Minimising the number of needed transfers* is the main reason for having the
+`update` function.
+
+### Remove a node from a location (leave)
+
+We can use the same update function to remove a node from a location, which in
+Riak terms is called a "leave". The node is removed from the ring data structure,
+but copying the data to create a new stable ring is a process that takes time,
+and only after it completes is the node actually removed.
+
+Assume we want to remove the node we have just added above.
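Before doing so, note that the cost of a transition can be quantified directly: the module exports `moves/2`, which counts the positions whose owner differs between two rings. A quick sketch, reusing the bindings from the examples above (the counts follow from the placements shown: 11 respectively 4 positions unchanged out of 16):
```
Updated = riak_core_claim_binring_alg:update(BinRing, [1,2,2], 2).
Solved = riak_core_claim_binring_alg:solve(16, [1,2,2], 2).
riak_core_claim_binring_alg:moves(BinRing, Updated).
% 5 transfers for the update result
riak_core_claim_binring_alg:moves(BinRing, Solved).
% 12 transfers for the fresh solve result
```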
+Returning to the removal: we go back to the initial configuration `[1, 1, 2]`:
+```
+BinRing2 = riak_core_claim_binring_alg:update(BinRing1, [1,1,2], 2).
+io:format("~s\n", [riak_core_claim_binring_alg:show(BinRing2, 2)]).
+B1 C2 A1 C1 A1 C1 B1 C2 B1 C1 B1 C2 A1 C1 A1 C2 (0 violations)
+```
+This does not give the same ring as the original placement, but it is close.
+In order to minimise transfers, 12 nodes keep their position.
+
+### Leave a location
+
+In theory we can also add and leave nodes in one go. This is probably not something
+one would like to do in operation, but the algorithm allows it.
+
+For example, if we update the ring above by moving one of the single nodes to
+the other location that has a single node:
+```
+NewBinRing = riak_core_claim_binring_alg:update(BinRing, [2,2], 2).
+io:format("~s\n", [riak_core_claim_binring_alg:show(NewBinRing, 2)]).
+B1 A2 B2 A1 B2 A1 B2 A2 B1 A1 B2 A2 B1 A1 B1 A2 (0 violations)
+```
+But that result is confusing, because now we have locations A and B, while the
+intention was to keep location C and move a node from B to A (or alternatively from A to B).
+
+We can patch the confusion in the [embedding layer using this algorithm](#embedding-the-algorithm-in-riak-core),
+where we translate real node names and locations back and forth to these configurations.
+But that layer becomes easier if we state our intentions clearly and
+allow a location with zero nodes in the provided configuration:
+```
+NewBinRing = riak_core_claim_binring_alg:update(BinRing, [2,0,2], 2).
+io:format("~s\n", [riak_core_claim_binring_alg:show(NewBinRing, 2)]).
+A2 C2 A1 C1 A1 C1 A2 C2 A1 C1 A2 C2 A1 C1 A2 C2 (0 violations)
+```
+If we compare that to the original placement:
+```
+B1 C2 A1 C1 A1 C1 B1 C2 A1 C1 B1 C2 A1 C1 B1 C2
+```
+we see that the nodes in location C have not changed, but that B1 is replaced by A2.
+
+# Embedding the algorithm in riak core
+
+In Riak the claim algorithm is configurable via `wants_claim_fun` and `choose_claim_fun`.
+In order to run with this new algorithm, one should configure `choose_claim_fun`
+to be `choose_claim_v4`. We do not use the wants function, but `riak_core_membership_claim`
+requires one, so use the default for version 2.
+
+
+The main entry point for claim is `riak_core_membership_claim:claim/1`.
+This in turn calls `riak_core_claim_swapping:choose_claim_v4/3`. This is just a
+wrapper to reach the real API, `riak_core_claim_swapping:claim/2`, which takes
+the present ring and the n-val as input.
+
+Riak always starts from an existing ring to compute the placement
+(in the case of a new node, the initial ring consists of that same node at each position).
+Therefore, we start with an update. If, however, `update` cannot find a solution without
+violations, we fall back to `solve`.
+
+### Mapping node/location names
+
+The main work in `riak_core_claim_swapping` is to map the real node names and the
+real locations to the configurations we provide the algorithm.
+
+A typical ring will contain node names as atoms and location names associated with
+those atoms. For example, one could have a ring of size 16 like this:
+```
+n2 n4 n1 n3 n1 n3 n2 n4 n1 n3 n2 n4 n1 n3 n2 n4
+```
+with the mapping `[{n1, loc1}, {n2, loc2}, {n3, loc3}, {n4, loc3}]`.
+We use this to create a list of tuples with location index and node index,
+something like:
+```
+[{2, 1}, {3, 2}, {1,1}, {3,1}, {1,1}, {3,1}, {2,1}, {3,2},
+ {1, 1}, {3, 1}, {2,1}, {3,2}, {1,1}, {3,1}, {2,1}, {3,2}]
+```
+where the second integer is the index of the node in that location.
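For illustration, a sketch of that translation (the real code in `riak_core_claim_swapping` differs in detail; in particular, the numbering of nodes within a location is more careful, as explained for leaves below):
```
%% Turn a ring of node names plus a [{Node, Location}] mapping into
%% [{LocationIndex, NodeIndexWithinLocation}] pairs.
loc_ix_pairs(RingNodes, NodeLocs) ->
    Locs = lists:usort([L || {_N, L} <- NodeLocs]),
    LocIx = maps:from_list(lists:zip(Locs, lists:seq(1, length(Locs)))),
    NodeIx = maps:from_list(
        [{N, {maps:get(L, LocIx),
              %% number nodes within a location, here simply by name order
              length([N1 || {N1, L1} <- NodeLocs, L1 =:= L, N1 =< N])}}
         || {N, L} <- NodeLocs]),
    [maps:get(N, NodeIx) || N <- RingNodes].
```
With the ring and mapping above, this yields exactly the tuple list just shown.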
+In location/node letters, this tuple list corresponds to:
+```
+B1 C2 A1 C1 A1 C1 B1 C2 A1 C1 B1 C2 A1 C1 B1 C2
+```
+With the function `riak_core_claim_binring_alg:from_list` we generate
+the ring in the binary form that the algorithm needs for the update function.
+
+The update function also takes the new configuration and then computes, as described
+above, a new ring, which we translate back into a list of tuples via
+`riak_core_claim_binring_alg:to_list`.
+
+The challenge is to make sure the right indices map to the right node names!
+Because what if we want to remove, say, node `n3`?
+The configuration that we compute from the riak ring object, in which the action
+`leave n3` is present, is clearly `[1, 1, 1]`.
+When we run update, the computed ring is:
+```
+R1 = riak_core_claim_binring_alg:update(BinRing, [1,1,1], 2).
+io:format("~s\n", [riak_core_claim_binring_alg:show(R1, 2)]).
+C1 B1 A1 B1 A1 C1 A1 C1 A1 C1 B1 A1 B1 C1 B1 A1 (0 violations)
+```
+But it is easy to be misled here: C1 must now be `n4`, and not `n3` as it was before.
+Our solution is to compute the mapping together with the binary ring
+in such a way that leaving nodes have a higher index than nodes that do not leave.
+So, instead we use the mapping `[{loc1, [n1]}, {loc2, [n2]}, {loc3, [n4,n3]}]` to compute
+the indices for the binary ring, which then swaps `{3,1}` and `{3,2}` and maps `n4` to C1:
+```
+[{2, 1}, {3, 1}, {1,1}, {3,2}, {1,1}, {3,2}, {2,1}, {3,1},
+ {1, 1}, {3, 2}, {2,1}, {3,1}, {1,1}, {3,2}, {2,1}, {3,1}]
+```
+Using this ring in update gives the resulting ring:
+```
+A1 C1 A1 B1 C1 A1 B1 C1 A1 C1 B1 A1 B1 A1 B1 C1 (0 violations)
+```
+which easily translates back to:
+```
+n1 n4 n1 n2 n4 n1 n2 n4 n1 n4 n2 n1 n2 n1 n2 n4
+```
+where `n3` is indeed removed. (This typical example unfortunately requires a lot of transfers.)
+
+
+# Legacy: no locations
+
+The claim algorithms versions 1 to 3 that have been used in Riak before do not consider
+locations. There the goal is just to satisfy the n-val for nodes. The new algorithm also
+supports that, so if you have no locations, you can use this newer
+algorithm. In fact, you can just configure this new claim algorithm and
+run as usual. The module `riak_core_claim_swapping` checks whether you have defined
+locations and, if not, it puts all the nodes in one location.
+
+Effectively, the `solve` and `update` functions are called with `{NVal, 1}` instead
+of `NVal` as argument, where the second element of the tuple is the location n-val.
+```
+BinRing = riak_core_claim_binring_alg:solve(16, [4], {2,1}).
+io:format("~s\n", [riak_core_claim_binring_alg:show(BinRing, {2, 1})]).
+A3 A1 A2 A4 A1 A2 A3 A4 A1 A2 A3 A4 A1 A2 A3 A4 (0 violations)
+```
+
+## Do not consider different n-vals
+
+In principle, one could use different n-val values for locations and nodes,
+for example n-val 4 for nodes but n-val 3 for locations. This, though, seems
+not to have a valuable use case other than the above one-location case.
+
+For example, take a configuration for which it is hard to find a solution and
+therefore the best-effort approach comes back with violations. We try to find a
+ring of size 32 with n-val 3 (both nodes and locations) and 3 locations with 2 nodes each:
+```
+BadRing = riak_core_claim_binring_alg:solve(32, [2,2,2], {3,3}).
+io:format("~s\n", [riak_core_claim_binring_alg:show(BadRing, {3, 3})]).
+A1 B1 C2 A1 B1 C1 A2 B2 A1 B1 C1 A2 B2 C2 A1 B1 C1 A2 B2 C2 A1 B1 C1 A2 B2 C2 A1 B1 C1 A2 B2 C2 (4 violations)
+```
+This ring has 4 violations, but if we examine it carefully, we can see that it
+satisfies n-val 2 for locations. This might be an acceptable risk, although not perfect.
+```
+io:format("~s\n", [riak_core_claim_binring_alg:show(BadRing, {3, 2})]).
+A1 B1 C2 A1 B1 C1 A2 B2 A1 B1 C1 A2 B2 C2 A1 B1 C1 A2 B2 C2 A1 B1 C1 A2 B2 C2 A1 B1 C1 A2 B2 C2 (0 violations)
+```
+
+In this case we try to solve for the ideal case, fail, and upon inspection decide
+that the result is good enough for a relaxed case. However, if we had directly tried
+to solve for the relaxed case, we would have found:
+```
+BadRing = riak_core_claim_binring_alg:solve(32, [2,2,2], {3,2}).
+io:format("~s\n", [riak_core_claim_binring_alg:show(BadRing, {3, 2})]).
+A1 B1 A2 C2 A1 B1 C1 B2 A1 B1 C1 A2 B2 C2 A1 B1 C1 A2 B2 C2 A1 B1 C1 A2 B2 C2 A1 B1 C1 A2 B2 C2 (0 violations)
+```
+But when we inspect this ring in more detail, we see that it has 8 violations
+when considering the more difficult target `{3, 3}`:
+```
+io:format("~s\n", [riak_core_claim_binring_alg:show(BadRing, {3, 3})]).
+A1 B1 A2 C2 A1 B1 C1 B2 A1 B1 C1 A2 B2 C2 A1 B1 C1 A2 B2 C2 A1 B1 C1 A2 B2 C2 A1 B1 C1 A2 B2 C2 (8 violations)
+```
+Therefore, this way around seems worse than taking the best-effort solution and
+inspecting how it behaves for relaxed requirements.
+
+
+# Solvable configurations
+
+As we saw above, the algorithm may fail to satisfy the provided n-val. In fact,
+there are many configurations that are simply impossible to solve: trivially so
+when the number of locations is smaller than the n-val, etc. But excluding those
+trivial cases, we played with a
+SAT solver to find 82 *impossible configurations* with ring size 16 and n-val 2 or 3.
+This resulted in some necessary requirements for a solution to exist at all,
+which we use in [QuickCheck tests](../test/riak_core_claim_eqc.erl#L261) to avoid
+testing the wrong things.
+
+Here we present some rules of thumb for good start configurations and typically
+more successful update configurations.
+
+In general, a larger ring size is easier to solve than a small ring size. We simply
+have more room to swap nodes to get to a solution. But note that it is more
+computationally intensive when the ring size grows.
+
+Distributing the nodes evenly over the locations makes it more likely to find a solution.
+For a realistic example with ring size 512, n-val 4 and 4 locations with 3 nodes each,
+we easily find a solution, and similarly when we put 2 nodes in each of the 4 locations.
+But the configuration `[3,3,3,4]` has no solution. In that case it actually works
+to put the extra node in a different location.
+
+In general, adding an extra location and having more locations than the n-val makes
+it easier to find a solution. With ring size 512 and n-val 3 a solution for `[2, 2, 2, 2]`
+is quickly found, but the best-effort solution for `[3, 3, 3]` has 4 violations.
+So, even though there are 9 nodes in the latter configuration and only 8 in the former,
+it is harder to find a placement.
+
+
+# Conclusion
+
+The new algorithm for node placement in a ring handles the case where location
+is an additional property to consider in a Riak installation. It is backward
+compatible with the situation in which no locations are considered at all.
+
+The algorithm handles both the addition of new nodes, in existing or new locations,
+and nodes leaving the ring.
+ +The algorithm has an inherent high complexity and can take a long time to come up +with a solution. Since the algorithm is only used when planning a new configuration +for a Riak installation, we find it acceptable that one needs to wait upto one or +two minutes for a solution. In fact, one only needs to wait long when it is hard +to find a solution. We provided some rules of thumb to provide configurations that +are relatively easy to solve. + +This algorithm will be released with the next version of Riak we create +for NHS-Digital. diff --git a/eqc/hashtree_eqc.erl b/eqc/hashtree_eqc.erl index 48f1368bf..d78b1b280 100644 --- a/eqc/hashtree_eqc.erl +++ b/eqc/hashtree_eqc.erl @@ -190,7 +190,7 @@ command(_S = #state{started = true, tree_id = TreeId, %% to make sure the iterator code is fully exercised. %% %% Store the hashtree records in the process dictionary under keys 't1' and 't2'. -%% +%% start(Params, [TreeId | ExtraIds], Tree1OpenOrEmpty, Tree2OpenOrEmpty) -> {Segments, Width, MemLevels} = Params, %% Return now so we can store symbolic value in procdict in next_state call @@ -271,7 +271,7 @@ update_snapshot(T, S) -> ok. -%% +%% %% Wrap the hashtree:update_perform call and erase the snapshot hashtree state. %% Should only happen if a snapshot state exists. %% @@ -490,7 +490,7 @@ next_state(S,_R,{call, _, local_compare1, []}) -> %% prop_correct() -> ?SETUP(fun() -> - application:set_env(lager, handlers, [{lager_console_backend, info}]), + application:set_env(lager, handlers, [{level, info}]), application:ensure_started(syntax_tools), application:ensure_started(compiler), application:ensure_started(goldrush), @@ -531,7 +531,7 @@ prop_correct() -> Res0 end, %% Clean up after the test - case Res of + case Res of ok -> % if all went well, remove leveldb files catch cleanup_hashtree(get(t1)), catch cleanup_hashtree(get(t2)); diff --git a/src/riak_core.app.src b/src/riak_core.app.src index 89f0d8420..68f9d68df 100644 --- a/src/riak_core.app.src +++ b/src/riak_core.app.src @@ -45,7 +45,7 @@ {wants_claim_fun, {riak_core_membership_claim, default_wants_claim}}, {choose_claim_fun, - {riak_core_memberhsip_claim, default_choose_claim}}, + {riak_core_membership_claim, default_choose_claim}}, %% Vnode inactivity timeout (how often to check if fallback vnodes %% should return their data) in ms. diff --git a/src/riak_core_claim_binring_alg.erl b/src/riak_core_claim_binring_alg.erl new file mode 100644 index 000000000..640b02780 --- /dev/null +++ b/src/riak_core_claim_binring_alg.erl @@ -0,0 +1,707 @@ +%% ------------------------------------------------------------------- +%% +%% riak_core: Core Riak Application +%% +%% This file is provided to you under the Apache License, +%% Version 2.0 (the "License"); you may not use this file +%% except in compliance with the License. You may obtain +%% a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, +%% software distributed under the License is distributed on an +%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +%% KIND, either express or implied. See the License for the +%% specific language governing permissions and limitations +%% under the License. +%% +%% ------------------------------------------------------------------- +%% +%% This is a purely functional algorithm for claiming nodes in a +%% ring. The reason to separate a pure algorithm is to allow +%% well tested code with subtle future changes. 
+%%
+%% The algorithm can be studied in isolation to make it easier
+%% to understand how it works.
+%%
+%% This algorithm computes a layout of nodes in a ring, such that
+%% when data is replicated on TargetN consecutive nodes in that ring,
+%% the consecutive nodes never coincide.
+%%
+%% Riak has always had the possibility to compute such placements,
+%% but when adding location awareness, things get a bit tricky.
+%% Now we do not have just one location "A", but possibly 2 locations,
+%% A and B, and we never want two adjacent nodes in the same location.
+%% With ring size 16 and 2 nodes for each of the locations A and B, a
+%% solution would be:
+%% A2 B2 A1 B1 A1 B1 A2 B2 A1 B1 A2 B2 A1 B1 A2 B2
+%% An important additional property of this algorithm is that
+%% none of the nodes is over-represented. That is, each node occurs
+%% either N or N+1 times in the list. We would violate this
+%% requirement if 2 A1 nodes were replaced by an A2.
+%%
+%% Thus, the algorithm computes, given a configuration of nodes,
+%% a ring size and a TargetN, a placement in a ring.
+%%
+%% We refer to the markup document in the docs directory for more
+%% details on the use of the algorithm.
+%%
+%% Below we describe the actual algorithm in more detail.
+%%
+%% The algorithm uses brute force to get to a placement given
+%% a ring size, a target n-val and a configuration of the number of
+%% nodes in each location.
+%%
+%% Step 1.
+%% Since the ring should be balanced, the initial step is to
+%% put all the given nodes in a sequence such that n-val is met.
+%% For example, if there are 7 nodes in two locations (say 3 and 4
+%% respectively) then the algorithm first places those 7 nodes in
+%% the best way it can w.r.t. the given n-val. Clearly, if we have
+%% only 2 locations, n-val should be at most 2. But then, there are
+%% solutions that place those 7 nodes such that the locations (and
+%% therewith the nodes) are not consecutive, even when wrapping around.
+%%
+%% Step 2.
+%% Repeat this small ring as often as possible within the ring size.
+%% Thus, if the ring size is 32 and we have 7 nodes, then we can repeat
+%% the small ring 4 times and are 4 nodes short.
+%% It would be unrealistic to provide a ring size that is less than
+%% the number of nodes one provides. So we assume it always fits at
+%% least once.
+%%
+%% Step 3.
+%% Fill the gaps with additional nodes (part of the small ring) if needed
+%% to get to full ring size.
+%% While n-val not reached (zero_violations is false):
+%% swap nodes (exchange position) or move nodes
+%% (moving vnode I to before vnode J).
+%%
+%% Step 3 gives a best-effort solution, but given the enormous number of
+%% possible operations, it can take a while to return. It does, however,
+%% always terminate.
+%%
+%% When we update a ring, we want as few transfers as possible,
+%% so first an effort is performed to just swap nodes. If that does not
+%% produce a solution, a brute-force attempt is made to get a best
+%% effort again.
+
+
+-module(riak_core_claim_binring_alg).
+
+-export([solve/3,
+         update/3,
+         zero_violations/2,
+         moves/2,
+         to_list/1, from_list/1]).
+
+-ifdef(TEST).
+-compile([export_all, nowarn_export_all]).
+-define(DEBUG_FUNS, true).
+-ifdef(EQC).
+-include_lib("eqc/include/eqc.hrl").
+-endif.
+-endif.
+
+-ifdef(DEBUG).
+-compile([export_all, nowarn_export_all]).
+-define(DEBUG_FUNS).
+-define(PROFILE, true).
+-include_lib("eqc/include/eqc_profile.hrl").
+-define(debug(Fmt, Args), io:format(Fmt, Args)).
+-else.
+-define(BENCHMARK(_, X), X).
+-define(debug(Fmt, Args), ok).
+-endif. + +%% -- Ring representation ------------------------------------------------- + +%% Represent the ring as a binary with one byte location index followed by one +%% byte node index. +-type ring() :: binary(). +-type ring_size() :: non_neg_integer(). +-type nval() :: non_neg_integer(). +-type node_nval() :: nval(). +-type loc_nval() :: nval(). +-type nvals() :: nval() | {node_nval(), loc_nval()}. + +-type config() :: [non_neg_integer()]. %% List of node counts per location + +-type loc() :: byte(). +-type ix() :: byte(). +-type pnode() :: {loc(), ix()}. %% Physical node +-type vnode() :: non_neg_integer(). %% Virtual node (index in ring) + +-spec ring_size(ring()) -> non_neg_integer(). +ring_size(Ring) -> byte_size(Ring) div 2. + +-spec from_list([pnode()]) -> ring(). +from_list(Nodes) -> << <> || {Loc, Ix} <- Nodes >>. + +-spec to_list(ring()) -> [pnode()]. +to_list(Ring) -> [ {Loc, Ix} || <> <= Ring ]. + +-spec get_node(ring(), vnode()) -> pnode(). +get_node(Ring, Ix) -> hd(window(Ring, Ix, 1)). + +-spec set_node(ring(), vnode(), pnode()) -> ring(). +set_node(Ring, VNode, {Loc, Ix}) -> + B = VNode * 2, + <> = Ring, + <>. + +%% Insert a node at the given vnode, making the ring one bigger. +-spec insert_node(ring(), vnode(), pnode()) -> ring(). +insert_node(Ring, VNode, {Loc, Ix}) -> + B = VNode * 2, + <> = Ring, + <>. + +-spec delete_node(ring(), vnode()) -> ring(). +delete_node(Ring, VNode) -> + B = VNode * 2, + <> = Ring, + <>. + +%% Return a window of size 2 * NVal - 1 centered on the given vnode. +-spec window(ring(), vnode(), nval()) -> [pnode()]. +window(Ring, Ix, NVal) -> + Size = ring_size(Ring), + Len = 2 * NVal - 1, + if Len > Size -> window(<>, Ix, NVal); + true -> + Lo = Ix - NVal + 1, + Hi = Lo + Len, + Win = if Lo < 0 -> <<(slice(Lo + Size, -Lo, Ring))/binary, + (slice(0, Hi, Ring))/binary>>; + Hi > Size -> <<(slice(Lo, Size - Lo, Ring))/binary, + (slice(0, Hi - Size, Ring))/binary>>; + true -> slice(Lo, Len, Ring) + end, + to_list(Win) + end. + +-spec slice(non_neg_integer(), non_neg_integer(), binary()) -> binary(). +slice(Addr, Len, Bin) -> + A = Addr * 2, + L = Len * 2, + <<_:A/binary, Res:L/binary, _/binary>> = Bin, + Res. + +-spec moves(ring(), ring()) -> non_neg_integer(). +moves(Ring1, Ring2) -> + length([ 1 || {N1, N2} <- lists:zip(to_list(Ring1), to_list(Ring2)), N1 /= N2 ]). + +%% -- NVal condition ------------------------------------------------------ + +-type violations() :: {non_neg_integer(), non_neg_integer()}. + +-define(zero_v, {0, 0}). +-define(is_zero_v(V), element(1, V) == 0 andalso element(2, V) == 0). + +zip_v(F, {A1, B1}, {A2, B2}) -> {F(A1, A2), F(B1, B2)}; +zip_v(F, {A1, B1, C1}, {A2, B2, C2}) -> {F(A1, A2), F(B1, B2), F(C1, C2)}; +zip_v(F, {A1, B1, C1, D1}, {A2, B2, C2, D2}) -> {F(A1, A2), F(B1, B2), F(C1, C2), F(D1, D2)}. + +-spec add_v(violations(), violations()) -> violations(). +add_v(V1, V2) -> zip_v(fun erlang:'+'/2, V1, V2). + +-spec sub_v(violations(), violations()) -> violations(). +sub_v(V1, V2) -> zip_v(fun erlang:'-'/2, V1, V2). + +-spec sum_v([violations()]) -> violations(). +sum_v(Vs) -> lists:foldl(fun add_v/2, ?zero_v, Vs). + +-spec zero_violations(ring(), nvals()) -> boolean(). +zero_violations(Ring, NVals) -> + V = violations(Ring, NVals), + ?is_zero_v(V). + +%% What's the maximum distance from an updated vnode where a violation change +%% can happen. +-spec max_violation_dist(nvals()) -> non_neg_integer(). +max_violation_dist({N, L}) -> max(N, L); +max_violation_dist(N) -> N. 
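A note on reading the violations pair, as an observation from the code rather than separate documentation: the first component counts location violations and the second node violations, so `?zero_v` means both constraints hold. A minimal sketch using the exported API, with a hypothetical four-vnode ring over two locations:
```
Ring = riak_core_claim_binring_alg:from_list([{1,1}, {2,1}, {1,2}, {2,2}]),
%% A1 B1 A2 B2: adjacent vnodes differ in both node and location, so the
%% ring is violation-free for n-val {2, 2}.
true = riak_core_claim_binring_alg:zero_violations(Ring, {2, 2}).
```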
+
+-spec violations(ring(), nvals()) -> violations().
+violations(Ring, NVals) ->
+    violations(Ring, NVals, 0, ring_size(Ring) - 1).
+
+-spec violations(ring(), nvals(), vnode(), vnode()) -> violations().
+violations(Ring, NVals, A, B) ->
+    violations(Ring, NVals, lists:seq(A, B)).
+
+%% Returns number of node and location violations caused by the given vnode.
+-spec violations(ring(), nvals(), vnode() | [vnode()]) -> violations().
+violations(Ring, NVals, VNodes) when is_list(VNodes) ->
+    sum_v([ violations(Ring, NVals, I) || I <- VNodes ]);
+violations(Ring, NVals, VNode) ->
+    ?BENCHMARK(violations, begin
+    {NVal, LVal} = case NVals of
+                       {N, L} -> {N, L};
+                       N -> {N, N}
+                   end,
+    Locs = fun(Ns) -> [ L || {L, _} <- Ns ] end,
+    NV = window_violations(window(Ring, VNode, NVal), NVal),
+
+    LocV = fun(D) -> window_violations(Locs(window(Ring, VNode, LVal + D)), LVal + D) end,
+    LV = LocV(0),
+    {LV, NV}
+    end).
+
+%% Given a window of size 2 * NVal - 1 centered on an element X, count the
+%% number of collisions with X in the slices of size NVal. For example
+%% window_violations([1, 0, 1, 1, 1], 3) == 4 because of these 4 collisions:
+%%   [1, 0, 1]   [0, 1, 1]   [1, 1, 1]
+%%    !     *        *  !     *  !  !
+%% (where ! marks a collision and * marks the center element)
+window_violations(Win, NVal) ->
+    window_violations(Win, 0, NVal).
+
+%% Ignore violations inside the cut-off (i.e. distance to the center =< CutOff).
+window_violations(Win, CutOff, NVal) ->
+    Masked = lists:zip(Win, lists:duplicate(NVal - 1 - CutOff, check)
+                            ++ lists:duplicate(CutOff, skip)
+                            ++ [original]
+                            ++ lists:duplicate(CutOff, skip)
+                            ++ lists:duplicate(NVal - 1 - CutOff, check)),
+    X = lists:nth(NVal, Win),
+    Windows = [ lists:sublist(Masked, I, NVal) || I <- lists:seq(1, length(Win) - NVal + 1) ],
+    length([ X || W <- Windows
+                , not lists:member({X, skip}, W) %% If we have a skipped collision we don't care about other collisions
+                , {Y, check} <- W, X == Y ]).
+
+%% -- Node count allocation -----------------------------------------------
+
+-spec nodes_in_config(config()) -> [pnode()].
+nodes_in_config(Locs) ->
+    [ {L, I}
+      || {I, _, L} <- lists:sort(
+                        [ {I, -N, L}
+                          || {L, N} <- enumerate(Locs)
+                           , I <- lists:seq(1, N) ])
+    ].
+
+enumerate(Xs) -> lists:zip(lists:seq(1, length(Xs)), Xs).
+
+%% When ring size is not divisible by the number of nodes, some nodes need to
+%% occur an extra time in the ring. We pick those from the smaller locations to
+%% make locations as balanced as possible.
+-spec extra_nodes(ring_size(), config()) -> [pnode()].
+extra_nodes(RingSize, Config) ->
+    NumNodes = lists:sum(Config),
+    Extra = RingSize rem NumNodes,
+    Count = RingSize div NumNodes,
+    distribute_extra_nodes(lists:sort([ {Count * N, L, 1, N} || {L, N} <- enumerate(Config) ]), Extra).
+
+distribute_extra_nodes(_, 0) -> [];
+distribute_extra_nodes([{_, _, Ix, Num} | Locs], Extra) when Ix > Num ->
+    distribute_extra_nodes(Locs, Extra);
+distribute_extra_nodes([{Total, Loc, Ix, Num} | Locs], Extra) ->
+    Entry = {Total + 1, Loc, Ix + 1, Num},
+    [{Loc, Ix} | distribute_extra_nodes(lists:merge([Entry], Locs), Extra - 1)].
+
+%% -- Brute force node swapping -------------------------------------------
+
+brute_force(Ring, NVals) ->
+    brute_force(Ring, NVals, []).
+
+brute_force(Ring, NVals, Options) ->
+    ?BENCHMARK(brute_force,
+               brute_force(Ring, NVals, Options, violations(Ring, NVals))).
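+
+%% Editor's worked example (an addition, not in the original module) for
+%% extra_nodes/2 above: with RingSize = 32 and Config = [3, 4] (7 nodes),
+%% each node appears 32 div 7 = 4 times and 32 rem 7 = 4 vnodes are left
+%% over. The extras are drawn so that locations stay as balanced as
+%% possible, so the smaller location contributes more of them:
+%%   extra_nodes(32, [3, 4]) =:= [{1,1}, {1,2}, {1,3}, {2,1}]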
+ +brute_force(Ring, NVals, Options, V) -> + TryHard = proplists:get_bool(try_hard, Options), + case V of + _ when not TryHard, ?is_zero_v(V) -> Ring; + ?zero_v -> Ring; + _ -> + N = ring_size(Ring), + %% TODO: keep swaps so we don't start over every time (earlier swaps are less likely to work) + Swaps = [ {swap, I, J} || I <- lists:seq(0, N - 2), J <- lists:seq(I, N - 1) ] ++ + lists:sort(fun({move, I1, J1}, {move, I2, J2}) -> abs(I1 - J1) =< abs(I2 - J2) end, + [ {move, I, J} || not proplists:get_bool(only_swap, Options) + , I <- lists:seq(0, N - 1), J <- lists:seq(0, N - 1) + , D <- [mod_dist(J, I, N)] + , D > 2 orelse D < -1 %% Moving just one step is a swap + ]), + brute_force(Ring, NVals, V, Options, Ring, ?zero_v, Swaps) + end. + +mod_dist(I, J, N) -> + D = (J - I + N) rem N, + if D * 2 > N -> D - N; + true -> D + end. + +%% TODO: Don't use DeltaV for BestV (total violations instead) +brute_force(_Ring, NVals, V, Options, Best, BestV, []) when BestV < ?zero_v -> + ?debug("~s\n", [show(Best, NVals)]), + brute_force(Best, NVals, Options, add_v(V, BestV)); +brute_force(_Ring, _NVals, _V, _Options, Best, _BestV, []) -> Best; +brute_force(Ring, NVals, V, Options, Best, BestV, [Op | Swaps]) -> + {Ring1, DV} = op(Ring, NVals, Op), + TryHard = proplists:get_bool(try_hard, Options), + if DV < ?zero_v, not TryHard -> + ?debug("~s\n", [show(Ring1, NVals)]), + brute_force(Ring1, NVals, Options, add_v(V, DV)); + DV < BestV -> + brute_force(Ring, NVals, V, Options, Ring1, DV, Swaps); + true -> + brute_force(Ring, NVals, V, Options, Best, BestV, Swaps) + end. + +op(Ring, NVals, {swap, I, J}) -> + ?BENCHMARK(swap, begin + Ring1 = swap(Ring, I, J), + OldV = violations(Ring, NVals, [I, J]), + NewV = violations(Ring1, NVals, [I, J]), + DV = sub_v(NewV, OldV), + %% Each violation is double-counted when we sum the entire ring + {Ring1, add_v(DV, DV)} + end); +op(Ring, NVals, {move, I, J}) -> + ?BENCHMARK(move, begin + %% {move, I, J} means moving vnode I to before vnode J + Ring1 = move(Ring, I, J), + N = ring_size(Ring), + NVal = max_violation_dist(NVals), + %% To compute the delta violations we figure out which vnodes in the original + %% ring are affected by the move. These are the vnodes within NVal - 1 of the + %% source or destination, except to the right of the destination where only + %% NVal - 2 nodes are affected. + OldIxs = lists:usort([ (K + 10 * N) rem N || K <- lists:seq(I - NVal + 1, I + NVal - 1) ++ + lists:seq(J - NVal + 1, J + NVal - 2) ]), + %% We need to compare the violations before and after for the affected + %% indices, so we need to know where they end up in the new ring. Only + %% indices between I and J are affected. + Remap = fun(K) when K >= J, K < I -> K + 1; %% {J = a .. K = * .. I = x} -> {J = x, J+1 = a, .. K+1 = *} + (K) when K > I, K < J -> K - 1; %% {I = x .. K = * .. J = a} -> {K-1 = *, J-1 = x, J = a} + (K) when K == I, I < J -> J - 1; %% {I = * .. J = a} -> {J-1 = *, J = a} + (K) when K == I, J < I -> J; %% {J = a .. I = *} -> {J = *, J+1 = a ..} + (K) -> K + end, + NewIxs = lists:map(Remap, OldIxs), + OldV = violations(Ring, NVals, OldIxs), + NewV = violations(Ring1, NVals, NewIxs), + DV = sub_v(NewV, OldV), + {Ring1, DV} + end). + +move(Ring, I, I) -> Ring; +move(Ring, I, J) -> + Node = get_node(Ring, I), + if I < J -> insert_node(delete_node(Ring, I), J - 1, Node); + true -> insert_node(delete_node(Ring, I), J, Node) + end. + +swap(Ring, I, J) -> + X = get_node(Ring, I), + Y = get_node(Ring, J), + set_node(set_node(Ring, I, Y), J, X). 
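+
+%% Editor's illustration (assumed semantics, derived from swap/3 and
+%% move/3 above): on a four-vnode ring [a, b, c, d],
+%%   {move, 0, 2} re-inserts vnode 0 before the old vnode 2: [b, a, c, d]
+%%   {swap, 0, 3} exchanges vnodes 0 and 3:                  [d, b, c, a]
+%% brute_force/4 enumerates exactly these two kinds of operations, taking
+%% any op that strictly reduces violations (or, with try_hard, the best
+%% improving op found in a full pass).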
+
+%% -- The solver ----------------------------------------------------------
+
+-spec solve(ring_size(), config(), nvals()) -> ring().
+solve(RingSize, Config, NVals) ->
+    NumNodes = lists:sum(Config),
+    Rounds = RingSize div NumNodes,
+    AllNodes = nodes_in_config(Config),
+    SmallRing = small_ring(AllNodes, NVals),
+    ?debug("SmallRing:\n~s\n", [show(SmallRing, NVals)]),
+    Extras = extra_nodes(RingSize, Config),
+    Cycle = fun(R) -> << <<SmallRing/binary>> || _ <- lists:seq(1, R) >> end,
+    ToRemove = AllNodes -- Extras,
+    BigRingD = solve_node_deletions(Cycle(Rounds + 1), NVals, ToRemove),
+    VD = violations(BigRingD, NVals),
+    ?debug("Delete\n~s\n", [show(BigRingD, NVals)]),
+    case VD of
+        ?zero_v -> brute_force(BigRingD, NVals);
+        _ ->
+            BigRingI = solve_node_insertions(Cycle(Rounds), NVals, Extras),
+            ?debug("Insert\n~s\n", [show(BigRingI, NVals)]),
+            VI = violations(BigRingI, NVals),
+            if VI < VD ->
+                    ?debug("Chose insert\n", []),
+                    brute_force(BigRingI, NVals);
+               true ->
+                    ?debug("Chose delete\n", []),
+                    brute_force(BigRingD, NVals)
+            end
+    end.
+
+%% The "small ring" is the solution when RingSize == NumNodes. If we can solve
+%% that we can repeat that pattern without introducing any violations. If we
+%% can't solve the small ring, rather than going with the best non-solution we
+%% add a fake location with a single node and try to solve that instead
+%% (inserting more and more fake locations until we get a solution). These fake
+%% nodes are stripped before we return.
+%% The rationale for this is that we get something where inserting more nodes
+%% can produce a solution, and for the big ring we do need to insert extra
+%% nodes if NumNodes is not a power of two.
+small_ring(AllNodes, NVals) ->
+    small_ring(AllNodes, NVals, -1).
+
+small_ring(AllNodes, NVals, FakeLoc) ->
+    SmallRing = brute_force(from_list(AllNodes), NVals, [try_hard]),
+    case violations(SmallRing, NVals) of
+        V when ?is_zero_v(V) ->
+            [ ?debug("SmallRing (with fakes)\n~s\n", [show(SmallRing, NVals)]) || FakeLoc < -1 ],
+            remove_fake(SmallRing);
+        _ -> small_ring([{FakeLoc, 1} | AllNodes], NVals, FakeLoc - 1)
+    end.
+
+remove_fake(Ring) ->
+    from_list([ Node || Node = {Loc, _} <- to_list(Ring), Loc < 128 ]).
+
+solve_node_insertions(Ring, NVals, Nodes) ->
+    lists:foldl(fun(N, R) -> solve_node_insertion(R, NVals, N) end,
+                Ring, Nodes).
+
+solve_node_insertion(Ring, NVals, Node) ->
+    solve_node_insertion(Ring, NVals, Node, 0, ring_size(Ring), undefined, undefined).
+
+solve_node_insertion(_, _, _, I, Size, BestR, _) when I >= Size -> BestR;
+solve_node_insertion(Ring, NVals, Node, I, Size, BestR, BestV) ->
+    Ring1 = insert_node(Ring, I, Node),
+    V = violations(Ring1, NVals), %% TODO: recompute local violation changes
+    if BestV == undefined; V < BestV ->
+            solve_node_insertion(Ring, NVals, Node, I + 1, Size, Ring1, V);
+       true ->
+            solve_node_insertion(Ring, NVals, Node, I + 1, Size, BestR, BestV)
+    end.
+
+solve_node_deletions(Ring, NVals, Nodes) ->
+    lists:foldl(fun(N, R) -> solve_node_deletion(R, NVals, N) end,
+                Ring, Nodes).
+
+solve_node_deletion(Ring, NVals, Node) ->
+    solve_node_deletion(Ring, NVals, Node, 0, ring_size(Ring), undefined, undefined).
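+
+%% Editor's worked note on solve/3 above (illustrative numbers): for
+%% solve(32, [3, 4], 2) the small ring has 7 nodes, so Rounds = 4 and
+%% there are 4 extra nodes. The deletion candidate cycles the small ring
+%% 5 times (35 vnodes) and deletes the 3 non-extra nodes; the insertion
+%% candidate cycles it 4 times (28 vnodes) and inserts the 4 extras.
+%% Whichever candidate has fewer violations is handed to brute_force/2.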
+
+solve_node_deletion(_, _, _, I, Size, BestR, _) when I >= Size -> BestR;
+solve_node_deletion(Ring, NVals, Node, I, Size, BestR, BestV) ->
+    case get_node(Ring, I) == Node of
+        false -> solve_node_deletion(Ring, NVals, Node, I + 1, Size, BestR, BestV);
+        true ->
+            Ring1 = delete_node(Ring, I),
+            V = violations(Ring1, NVals), %% TODO: recompute local violation changes
+            if BestV == undefined; V < BestV -> solve_node_deletion(Ring, NVals, Node, I + 1, Size, Ring1, V);
+               true -> solve_node_deletion(Ring, NVals, Node, I + 1, Size, BestR, BestV)
+            end
+    end.
+
+%% -- Updating ------------------------------------------------------------
+
+nodes_in_ring(RingSize, Config) ->
+    X = RingSize div lists:sum(Config),
+    lists:append(lists:duplicate(X, nodes_in_config(Config))) ++ extra_nodes(RingSize, Config).
+
+-spec update(ring(), config(), nvals()) -> ring().
+update(OldRing, Config, NVals) ->
+    %% Diff old and new config
+    RingSize = ring_size(OldRing),
+    OldNodes = to_list(OldRing),
+    NewNodes = nodes_in_ring(RingSize, Config),
+    ToAdd = NewNodes -- OldNodes,
+    ToRemove = OldNodes -- NewNodes,
+    %% Swap in new nodes for old nodes (in a moderately clever way)
+    NewRing = swap_in_nodes(OldRing, ToAdd, ToRemove, NVals),
+    %% Brute force fix any remaining conflicts
+    brute_force(NewRing, NVals, []).
+
+swap_in_nodes(Ring, [], [], _NVals) -> Ring;
+swap_in_nodes(Ring, [New | ToAdd], ToRemove, NVals) ->
+    {Ring1, Removed} = find_swap(Ring, New, ToRemove, NVals),
+    swap_in_nodes(Ring1, ToAdd, ToRemove -- [Removed], NVals).
+
+find_swap(Ring, New, ToRemove, NVals) ->
+    Swap = fun(I) ->
+                   Old = get_node(Ring, I),
+                   [ begin
+                         Ring1 = set_node(Ring, I, New),
+                         V = violations(Ring1, NVals, I),
+                         {V, Ring1, Old}
+                     end || lists:member(Old, ToRemove) ]
+           end,
+    {_V, Ring1, Removed} = lists:min(lists:flatmap(Swap, lists:seq(0, ring_size(Ring) - 1))),
+    {Ring1, Removed}.
+
+%% -- Debugging -----------------------------------------------------------
+
+-ifdef(DEBUG_FUNS).
+pp_violations({L, N}) -> pp_violations({L, N, 0});
+pp_violations({L, N, L1}) -> pp_violations({L, N, L1, 0});
+pp_violations({L, N, A, B}) ->
+    [ io_lib:format("~p", [L])
+    , [ io_lib:format(" + ~pn", [N]) || N /= 0 ]
+    , [ io_lib:format(" + ~pa", [A]) || A /= 0 ]
+    , [ io_lib:format(" + ~pb", [B]) || B /= 0 ]
+    ].
+
+show(Ring, NVals) ->
+    Color = fun(?zero_v, S) -> S;
+               (V, S) when ?is_zero_v(V) -> "\e[34m" ++ S ++ "\e[0m";
+               (_, S) -> "\e[31m" ++ S ++ "\e[0m" end,
+    TotalV = violations(Ring, NVals),
+    Vs = [ violations(Ring, NVals, I) || I <- lists:seq(0, ring_size(Ring) - 1) ],
+    lists:flatten(io_lib:format("~s(~s violations)",
+        [ [io_lib:format(Color(V, "~c~p "), [L + $A - 1, I]) || {{L, I}, V} <- lists:zip(to_list(Ring), Vs)]
+        , pp_violations(TotalV) ])).
+
+show_solve(RingSize, Config, NVals) ->
+    io:format("~s\n", [show(solve(RingSize, Config, NVals), NVals)]).
+
+show_update(RingSize, OldConfig, NewConfig, NVals) ->
+    OldRing = solve(RingSize, OldConfig, NVals),
+    NewRing = update(OldRing, NewConfig, NVals),
+    io:format("Old\n~s\nNew\n~s\nDiff=~p\n", [show(OldRing, NVals), show(NewRing, NVals), moves(OldRing, NewRing)]).
+-endif.
+
+%% -- Testing ---------------------------------------------------------------
+
+-ifdef(TEST).
+
+%% -- Unit tests for experimentation ---------------------------------------
+%% These tests take some time to run, so they are not intended to be
+%% included in automated testing.
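+
+%% A usage sketch added by the editor (shell session; exact rings depend
+%% on the search, so the results shown are indicative only):
+%%   1> Alg = riak_core_claim_binring_alg.
+%%   2> R0 = Alg:solve(16, [2, 2], {2, 2}).  %% 2 locations x 2 nodes
+%%   3> Alg:zero_violations(R0, {2, 2}).
+%%   true
+%%   4> R1 = Alg:update(R0, [2, 2, 1], {2, 2}).  %% join a 5th node
+%%   5> Alg:moves(R0, R1).  %% transfers needed to adopt the new ring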
+
+known_hard_tests() ->
+    Tests = [ {16, [4, 3, 3, 2], 3}
+            , {32, [3, 2, 1, 4, 3], 3}
+            , {32, [5, 6, 5, 1, 1], 3}
+            , {128, [1, 1, 1, 1, 1, 1], 5}
+            , {16, [4, 4, 4, 3], 4}
+            , {16, [4, 4, 3, 3], 4}
+            , {16, [4, 3, 3, 3], 4}
+            , {32, [4, 3, 3, 3], 4}
+            , {48, [4, 3, 3, 3], 4}
+            , {32, [2, 2, 2, 2, 2], 4}
+            , {16, [2, 2, 1, 2, 2], 4}
+            , {16, [2, 2, 4, 2], 4}
+            , {16, [3, 2, 2, 2], 4}
+            , {32, [3, 2, 2, 2], 4}
+            , {32, [3, 3, 3, 1, 1], 4}
+            , {16, [1, 3, 2, 1, 1, 1], 4}
+            , {64, [2, 2, 1, 2, 2, 2], 5}
+            , {256, [6, 5, 2], 2}
+            , {64, [3, 3, 3, 2, 1], 4}
+            , {32, [3, 3, 3, 3, 1], 4}
+            , {512, [4, 4, 4, 4, 1], 4}
+            ],
+    [ {Size, Config, NVal, '->', V}
+      || {Size, Config, NVal} <- Tests
+       , V <- [violations(solve(Size, Config, NVal), NVal)]
+       , not ?is_zero_v(V)
+    ].
+
+typical_scenarios_tests() ->
+    %% We simulate updates from a fresh ring to more and more nodes and locations
+    NVal = 4,
+    Tests = [ [1]
+            , [2, 2, 2, 2]
+            , [2, 2, 2, 2, 1]
+            , [2, 2, 2, 2, 2]
+            , [2, 2, 2, 2, 2, 1]
+            , [2, 2, 2, 2, 2, 2]
+            , [3, 2, 2, 2, 2, 2]
+            , [3, 3, 2, 2, 2, 2]
+            , [3, 3, 3, 2, 2, 2]
+            , [3, 3, 3, 3, 2, 2]
+            ],
+    Results =
+        [ lists:foldl(
+            fun(_Config, Err={error, _}) ->
+                    Err;
+               (Config, {undefined, Diffs}) ->
+                    {solve(Size, Config, NVal), Diffs};
+               (Config, {OldRing, Diffs}) ->
+                    NewRing = update(OldRing, Config, NVal),
+                    V = violations(NewRing, NVal),
+                    Diff = moves(OldRing, NewRing),
+                    if ?is_zero_v(V) -> {NewRing, Diffs ++ [Diff]};
+                       true -> {error, {Size, OldRing, NewRing, Config, V}}
+                    end
+            end, {undefined, [0]}, Tests)
+          || Size <- [64, 128, 256, 512, 1024]
+        ],
+    case [ Err || {error, Err} <- Results ] of
+        [] -> {ok, [ Diff || {_Ring, Diff} <- Results ]};
+        Errs -> {error, Errs}
+    end.
+
+
+-ifdef(EQC).
+
+%% -- Generators ----------------------------------------------------------
+
+pnode() -> {choose(1, 16), choose(1, 16)}.
+
+ring() -> non_empty(list(pnode())).
+
+nvals() -> ?LET(NVal, choose(1, 5),
+           ?LET(LVal, choose(1, NVal),
+           if NVal == LVal -> NVal; true -> {NVal, LVal} end)).
+
+op(N) ->
+    Ix = choose(0, N - 1),
+    ?SUCHTHAT(Op, {elements([swap, move]), Ix, Ix},
+              case Op of
+                  {swap, _, _} -> true;
+                  {move, _, _} -> true
+              end).
+
+%% -- Properties ----------------------------------------------------------
+
+prop_window() ->
+    ?FORALL(Nodes, ring(),
+    ?FORALL({Ix, NVal}, {choose(0, length(Nodes) - 1), choose(1, 5)},
+    begin
+        Ring = from_list(Nodes),
+        Window = subring(Nodes, Ix - NVal + 1, 2 * NVal - 1),
+        equals(window(Ring, Ix, NVal), Window)
+    end)).
+
+prop_get_node() ->
+    ?FORALL(Nodes, ring(),
+    begin
+        Ring = from_list(Nodes),
+        equals([ get_node(Ring, I) || I <- lists:seq(0, ring_size(Ring) - 1) ],
+               Nodes)
+    end).
+
+subring(Xs, Ix, Len) when Ix < 0 -> subring(Xs ++ Xs, Ix + length(Xs), Len);
+subring(Xs, Ix, Len) when Ix + Len > length(Xs) -> subring(Xs ++ Xs, Ix, Len);
+subring(Xs, Ix, Len) -> lists:sublist(Xs, Ix + 1, Len).
+
+prop_swap_violations() ->
+    ?FORALL(Nodes, ring(),
+    ?FORALL({Op, NVals}, {op(length(Nodes)), nvals()},
+    begin
+        Ring = from_list(Nodes),
+        V = violations(Ring, NVals),
+        {Ring1, DV} = op(Ring, NVals, Op),
+        V1 = violations(Ring1, NVals),
+        ?WHENFAIL(io:format("Original: ~s\nSwapped: ~s\nV = ~p\nV1 = ~p\nDV = ~p\n",
+                            [show(Ring, NVals), show(Ring1, NVals), V, V1, DV]),
+        equals(add_v(V, DV), V1))
+    end)).
+
+%% In legacy riak there are no locations and only the NVal for nodes is
+%% important. This property checks that we can model that either as a
+%% single location with location nval == 1, or by giving each node its
+%% own location, and that both give the same number of node violations.
+prop_no_locations() -> + ?FORALL({Size, Nodes, NVal}, {elements([16, 32, 64, 128, 256, 512]), choose(1, 64), choose(1,5)}, + begin + {OneT, OneRing} = timer:tc(?MODULE, solve, [Size, [Nodes], {NVal, 1}]), + {_, OneViolations} = violations(OneRing, {NVal, 1}), + {SepT, SepRing} = timer:tc(?MODULE, solve, [Size, lists:duplicate(Nodes, 1), NVal]), + {_, SepViolations} = violations(SepRing, NVal), + measure(one_location, OneT, + measure(sep_location, SepT, + equals(OneViolations, SepViolations))) + end). + + +-endif. +-endif. diff --git a/src/riak_core_claim_location.erl b/src/riak_core_claim_location.erl deleted file mode 100644 index c7af5dd89..000000000 --- a/src/riak_core_claim_location.erl +++ /dev/null @@ -1,1647 +0,0 @@ -%% ------------------------------------------------------------------- -%% -%% riak_core: Core Riak Application -%% -%% This file is provided to you under the Apache License, -%% Version 2.0 (the "License"); you may not use this file -%% except in compliance with the License. You may obtain -%% a copy of the License at -%% -%% http://www.apache.org/licenses/LICENSE-2.0 -%% -%% Unless required by applicable law or agreed to in writing, -%% software distributed under the License is distributed on an -%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -%% KIND, either express or implied. See the License for the -%% specific language governing permissions and limitations -%% under the License. -%% -%% ------------------------------------------------------------------- - -%% @doc choose and sequential claim functions for a more location friendly -%% claim algorithm - --module(riak_core_claim_location). - --export( - [ - choose_claim_v4/2, choose_claim_v4/3, - sequential_claim/2, sequential_claim/3, - sort_members_for_choose/3 - ]). - --type location_finder() :: fun((node()) -> atom()). - --spec sort_members_for_choose( - riak_core_ring:riak_core_ring(), - list(node()), - list({non_neg_integer(), node()})) -> - list({non_neg_integer(), node()}). -sort_members_for_choose(Ring, Members, Owners) -> - NodesLocations = riak_core_ring:get_nodes_locations(Ring), - case riak_core_location:has_location_set_in_cluster(NodesLocations) of - false -> - Members; - true -> - LocationNodesD = - riak_core_location:get_location_nodes(Members, NodesLocations), - InitSort = initial_location_sort(dict:to_list(LocationNodesD)), - lists:append(lists:subtract(InitSort, Owners), Owners) - end. - -initial_location_sort(LocationNodeList) -> - NodeLists = - sort_lists_by_length( - lists:map(fun({_L, NL}) -> NL end, LocationNodeList)), - roll_nodelists(NodeLists, []). - -roll_nodelists(NodeLists, ListOfNodes) -> - case length(hd(NodeLists)) of - L when L > 1 -> - {UpdNodeLists, UpdListOfNodes} = - lists:mapfoldl( - fun(NL, Acc) -> - case length(NL) of - L when L > 1 -> - [H|T] = NL, - {T, [H|Acc]}; - _ -> - {NL, Acc} - end - end, - ListOfNodes, - NodeLists), - roll_nodelists(UpdNodeLists, UpdListOfNodes); - 1 -> - ListOfNodes ++ lists:flatten(NodeLists) - end. - -choose_claim_v4(Ring, Node) -> - Params = riak_core_membership_claim:default_choose_params(), - choose_claim_v4(Ring, Node, Params). - --spec choose_claim_v4( - riak_core_ring:riak_core_ring(), node(), list(tuple())) -> - riak_core_ring:riak_core_ring(). 
-choose_claim_v4(Ring, Node, Params0) -> - Params = riak_core_membership_claim:default_choose_params(Params0), - Active = riak_core_ring:claiming_members(Ring), - Owners = riak_core_ring:all_owners(Ring), - Ownerships = riak_core_membership_claim:get_counts(Active, Owners), - RingSize = riak_core_ring:num_partitions(Ring), - NodeCount = length(Active), - {MinVnodes, MaxVnodes, Deltas} - = assess_deltas(RingSize, NodeCount, Ownerships), - {Node, CurrentOwnerships} = - lists:keyfind(Node, 1, Ownerships), - Want = MaxVnodes - CurrentOwnerships, - TargetN = proplists:get_value(target_n_val, Params), - - NodesToClaim = lists:filter(fun({_N, O}) -> O == 0 end, Ownerships), - NodesAllClaimed = - case NodesToClaim of - [{Node, _}] -> - true; - [] -> - true; - _ -> - false - end, - - ZippedIndices = - lists:zip( - lists:seq(0, length(Owners) - 1), - [Idx || {Idx, _} <- Owners] - ), - AllIndices = - sort_indices_for_claim( - ZippedIndices, length(Active), Owners, Deltas, NodesAllClaimed), - - EnoughNodes = - (NodeCount > TargetN) - or ((NodeCount == TargetN) and (RingSize rem TargetN =:= 0)), - - Indices = - case EnoughNodes of - true -> - %% If we have enough nodes to meet target_n, then we prefer to - %% claim indices that are currently causing violations, and - %% then fallback to indices in linear order. The filtering - %% steps below will ensure no new violations are introduced. - NodeViolations = find_node_violations(Ring, TargetN), - LocationViolations = - lists:subtract( - find_location_violations(Ring, TargetN), - NodeViolations), - {DirtyNodeIndices, OtherIndices} = - lists:splitwith( - fun({_Nth, Idx}) -> - lists:member(Idx, NodeViolations) - end, - AllIndices), - {DirtyLocationIndices, CleanIndices} = - lists:splitwith( - fun({_Nth, Idx}) -> - lists:member(Idx, LocationViolations) - end, - OtherIndices - ), - DirtyNodeIndices ++ DirtyLocationIndices ++ CleanIndices; - false -> - AllIndices - end, - - %% Filter out indices that conflict with the node's existing ownership - ClaimableIdxs = - prefilter_violations( - Ring, Node, AllIndices, Indices, TargetN, RingSize), - - %% Claim indices from the remaining candidate set - Claim2 = - case select_indices( - Owners, Deltas, ClaimableIdxs, TargetN, RingSize) of - [] -> - []; - Claim -> - lists:sublist(Claim, Want) - end, - NewRing = - lists:foldl( - fun(Idx, Ring0) -> - riak_core_ring:transfer_node(Idx, Node, Ring0) - end, - Ring, - Claim2), - - BadRing = length(meets_target_n(NewRing, TargetN)) > 0, - DeficientClaim = (length(Claim2) + CurrentOwnerships) < MinVnodes, - BadClaim = EnoughNodes and BadRing and NodesAllClaimed, - - MaybeBalancedRing = - case NodesAllClaimed and (MinVnodes < MaxVnodes) of - true -> - NewOwners = riak_core_ring:all_owners(NewRing), - NewOwnerships = - riak_core_membership_claim:get_counts(Active, NewOwners), - {MinVnodes, MaxVnodes, NewDeltas} - = assess_deltas(RingSize, NodeCount, NewOwnerships), - NodesToGive = - lists:filter( - fun({_N, D}) -> - case D of - D when D < (MinVnodes - MaxVnodes) -> - true; - _ -> - false - end - end, - NewDeltas), - NodesToTake = - lists:filtermap( - fun({N, D}) -> - case D of 0 -> {true, N}; _ -> false end - end, - NewDeltas), - give_partitions( - NodesToGive, NodesToTake, ZippedIndices, TargetN, NewRing); - false -> - NewRing - end, - - case BadClaim or DeficientClaim of - true -> - sequential_claim(Ring, Node, TargetN); - _ -> - MaybeBalancedRing - end. 
- --spec give_partitions( - list({node(), integer()}), - list(node()), - list({non_neg_integer(), non_neg_integer()}), - pos_integer(), - riak_core_ring:riak_core_ring()) -> riak_core_ring:riak_core_ring(). -give_partitions([], _TakeNodes, _ZipIndices, _TargetN, Ring) -> - Ring; -give_partitions(_, [], _ZipIndices, _TargetN, Ring) -> - Ring; -give_partitions([{_Node, -1}|Rest], TakeNodes, ZipIndices, TargetN, Ring) -> - give_partitions(Rest, TakeNodes, ZipIndices, TargetN, Ring); -give_partitions([{Node, D}|Rest], TakeNodes, ZipIndices, TargetN, Ring) -> - Owners = riak_core_ring:all_owners(Ring), - Partitions = - lists:filtermap( - fun({Idx, N}) -> case N of Node -> {true, Idx}; _ -> false end end, - Owners), - {Success, ClaimableIdx, ReceivingNode} = - lists:foldl( - fun (_Idx, {true, P, RcvNode}) -> - {true, P, RcvNode}; - (Idx, {false, undefined, undefined}) -> - PotentialHomes = - find_home( - Idx, TakeNodes, ZipIndices, TargetN, Owners, Ring), - case PotentialHomes of - [] -> - {false, undefined, undefined}; - [HN|_Rest] -> - {true, Idx, HN} - end - end, - {false, undefined, undefined}, - Partitions), - case {Success, ClaimableIdx, ReceivingNode} of - {true, ClaimableIdx, ReceivingNode} -> - give_partitions( - [{Node, D + 1}|Rest], - TakeNodes -- [ReceivingNode], - ZipIndices, - TargetN, - riak_core_ring:transfer_node( - ClaimableIdx, ReceivingNode, Ring)); - {false, undefined, undefined} -> - give_partitions(Rest, TakeNodes, ZipIndices, TargetN, Ring) - end. - - -find_home(Idx, TakeNodes, ZippedIndices, TargetN, Owners, Ring) -> - {Nth, Idx} = lists:keyfind(Idx, 2, ZippedIndices), - RS = length(ZippedIndices), - OwningNodes = - lists:usort( - lists:map( - fun(N0) -> - N1 = - case N0 of - N0 when N0 < 0 -> RS + N0; - N0 when N0 >= RS -> N0 - RS; - N0 -> N0 - end, - {N1, I} = lists:keyfind(N1, 1, ZippedIndices), - {I, O} = lists:keyfind(I, 1, Owners), - O - end, - lists:seq((Nth + 1) - TargetN, Nth + TargetN - 1) -- [Nth]) - ), - NodesLocations = riak_core_ring:get_nodes_locations(Ring), - case riak_core_location:has_location_set_in_cluster(NodesLocations) of - false -> - TakeNodes -- OwningNodes; - true -> - Locations = - lists:usort( - lists:map( - fun(N) -> - riak_core_location:get_node_location( - N, NodesLocations) - end, - OwningNodes)), - lists:filter( - fun(TN0) -> - not lists:member( - riak_core_location:get_node_location( - TN0, NodesLocations), - Locations) - end, - TakeNodes) - end. - - --spec sort_indices_for_claim( - list({non_neg_integer(), non_neg_integer()}), - pos_integer(), - [{non_neg_integer(), node()}], - [{node(), integer()}], - boolean()) -> list({non_neg_integer(), non_neg_integer()}). -sort_indices_for_claim( - ZippedIndices, ActiveMemberCount, Owners, Deltas, _NodesAllClaimed) -> - StripeCount = max(1, (ActiveMemberCount - 1)), - StripeList = - lists:map( - fun({Nth, I}) -> {Nth rem StripeCount, Nth, I} end, - ZippedIndices), - Counter = - dict:from_list( - lists:map(fun(I) -> {I, 0} end, lists:seq(0, StripeCount - 1))), - Counted = - lists:foldl( - fun({R, _Nth, _I}, C) -> dict:update_counter(R, 1, C) end, - Counter, - StripeList), - lists:map( - fun({_OD, _RC, _R, Nth, I}) -> {Nth, I} end, - lists:sort( - lists:map( - fun({R, Nth, I}) -> - {I, Owner} = lists:keyfind(I, 1, Owners), - {Owner, Delta} = lists:keyfind(Owner, 1, Deltas), - {Delta, dict:fetch(R, Counted), R, Nth, I} - end, - lists:reverse(StripeList) - ))). 
- -%% @doc -%% Assess what the minimum and maximum number of vnodes which should be owned -%% by each node, and return a list of nodes with the Deltas from the minimum -%% i.e. where a node has more vnodes than the minimum the delta will be a -%% negative number indicating the number of vnodes it can offer to a node with -%% wants. --spec assess_deltas( - pos_integer(), pos_integer(), [{node(), non_neg_integer()}]) -> - {non_neg_integer(), pos_integer(), [{node(), integer()}]}. -assess_deltas(RingSize, NodeCount, Ownerships) -> - MinVnodes = RingSize div NodeCount, - MaxVnodes = - case RingSize rem NodeCount of - 0 -> - MinVnodes; - _ -> - MinVnodes + 1 - end, - Deltas = - lists:map(fun({N, VNs}) -> {N, MinVnodes - VNs} end, Ownerships), - {MinVnodes, MaxVnodes, Deltas}. - - -%% @private -%% -%% @doc Filter out candidate indices that would violate target_n given -%% a node's current partition ownership. Only interested in indices which -%% are not currently owned within a location --spec prefilter_violations( - riak_core_ring:riak_core_ring(), - node(), - list({non_neg_integer(), non_neg_integer()}), - list({non_neg_integer(), non_neg_integer()}), - pos_integer(), - pos_integer()) -> list({non_neg_integer(), non_neg_integer()}). -prefilter_violations(Ring, Node, AllIndices, Indices, TargetN, RingSize) -> - CurrentIndices = - indices_nth_subset(AllIndices, riak_core_ring:indices(Ring, Node)), - case riak_core_location:support_locations_claim(Ring, TargetN) of - true -> - OtherLocalNodes = - riak_core_location:local_nodes(Ring, Node), - LocalIndices = - indices_nth_subset( - AllIndices, - lists:flatten( - lists:map( - fun(N) -> riak_core_ring:indices(Ring, N) end, - [Node|OtherLocalNodes]))), - SafeRemoteIndices = - safe_indices( - lists:subtract(Indices, LocalIndices), - LocalIndices, TargetN, RingSize), - SafeLocalIndices = - safe_indices( - lists:subtract( - lists:filter( - fun(NthIdx) -> lists:member(NthIdx, Indices) end, - LocalIndices), - CurrentIndices), - CurrentIndices, TargetN, RingSize), - SafeRemoteIndices ++ SafeLocalIndices; - false -> - safe_indices( - lists:subtract(AllIndices, CurrentIndices), - CurrentIndices, TargetN, RingSize) - end. - --spec indices_nth_subset( - list({non_neg_integer(), non_neg_integer()}), - list(non_neg_integer())) -> - list({non_neg_integer(), non_neg_integer()}). -indices_nth_subset(IndicesNth, Indices) -> - lists:filter(fun({_N, Idx}) -> lists:member(Idx, Indices) end, IndicesNth). - --spec safe_indices( - list({non_neg_integer(), non_neg_integer()}), - list({non_neg_integer(), non_neg_integer()}), - pos_integer(), - pos_integer()) -> - list({non_neg_integer(), non_neg_integer()}). -safe_indices( - IndicesToCheck, LocalIndicesToAvoid, TargetN, RingSize) -> - lists:filter( - fun({Nth, _Idx}) -> - lists:all( - fun({CNth, _}) -> - riak_core_membership_claim:spaced_by_n( - CNth, Nth, TargetN, RingSize) - end, - LocalIndicesToAvoid) - end, - IndicesToCheck - ). - - --spec meets_target_n( - riak_core_ring:riak_core_ring(), pos_integer()) -> - list({non_neg_integer(), node(), list(node())}). 
-meets_target_n(Ring, TargetN) when TargetN > 1 -> - {_RingSize, Mappings} = riak_core_ring:chash(Ring), - Prefix = lists:sublist(Mappings, TargetN - 1), - CheckableMap = Mappings ++ Prefix, - {_, Failures} = - lists:foldl( - fun({Idx, N}, {LastNminus1, Fails}) -> - case lists:member(N, LastNminus1) of - false -> - {[N|lists:sublist(LastNminus1, TargetN - 2)], Fails}; - true -> - {[N|lists:sublist(LastNminus1, TargetN - 2)], - [{Idx, N, LastNminus1}|Fails]} - end - end, - {[], []}, - CheckableMap), - Failures; -meets_target_n(_Ring, _TargetN) -> - true. - -%% @private -%% -%% @doc Select indices from a given candidate set, according to two -%% goals. -%% -%% 1. Ensure greedy/local target_n spacing between indices. Note that this -%% goal intentionally does not reject overall target_n violations. -%% -%% 2. Select indices based on the delta between current ownership and -%% expected ownership. In other words, if A owns 5 partitions and -%% the desired ownership is 3, then we try to claim at most 2 partitions -%% from A. -select_indices(_Owners, _Deltas, [], _TargetN, _RingSize) -> - []; -select_indices(Owners, Deltas, Indices, TargetN, RingSize) -> - OwnerDT = dict:from_list(Owners), - %% Claim partitions and check that subsequent partitions claimed by this - %% node do not break the target_n invariant. - {Claims, _NClaims, _Deltas} = - lists:foldl( - fun({Nth, Idx}, {IdxClaims, NthClaims, DeltaDT}) -> - Owner = dict:fetch(Idx, OwnerDT), - Delta = dict:fetch(Owner, DeltaDT), - MeetsTN = - lists:all( - fun(ClaimedNth) -> - riak_core_membership_claim:spaced_by_n( - ClaimedNth, Nth, TargetN, RingSize) - end, - NthClaims), - case (Delta < 0) and MeetsTN of - true -> - NextDeltaDT = - dict:update_counter(Owner, 1, DeltaDT), - {[Idx|IdxClaims], [Nth|NthClaims], NextDeltaDT}; - false -> - {IdxClaims, NthClaims, DeltaDT} - end - end, - {[], [], dict:from_list(Deltas)}, - Indices), - lists:reverse(Claims). - - -%% @private -%% -%% @doc Determines indices that violate the given target_n spacing -%% property. --spec find_node_violations( - riak_core_ring:riak_core_ring(), pos_integer()) - -> list(non_neg_integer()). -find_node_violations(Ring, TargetN) -> - Owners = riak_core_ring:all_owners(Ring), - find_violations(Owners, TargetN). - --spec find_location_violations( - riak_core_ring:riak_core_ring(), pos_integer()) - -> list(non_neg_integer()). -find_location_violations(Ring, TargetN) -> - case riak_core_location:support_locations_claim(Ring, TargetN) of - true -> - find_violations( - riak_core_location:get_location_owners(Ring), TargetN); - false -> - [] - end. - --spec find_violations( - list({non_neg_integer(), atom()}), pos_integer()) - -> list(non_neg_integer()). -find_violations(Owners, TargetN) -> - Suffix = lists:sublist(Owners, TargetN - 1), - %% Add owners at the front to the tail, to confirm no tail violations - OwnersWithTail = Owners ++ Suffix, - %% Use a sliding window to determine violations - {Bad, _} = - lists:foldl( - fun(P={Idx, Owner}, {Out, Window}) -> - Window2 = lists:sublist([P|Window], TargetN-1), - case lists:keyfind(Owner, 2, Window) of - {_PrevIdx, Owner} -> - {[Idx | Out], Window2}; - false -> - {Out, Window2} - end - end, - {[], lists:sublist(Owners, 2, TargetN - 1)}, - OwnersWithTail), - lists:usort(Bad). - --spec sequential_claim( - riak_core_ring:riak_core_ring(), node()) -> - riak_core_ring:riak_core_ring(). -sequential_claim(Ring, Node) -> - TN = riak_core_membership_claim:get_target_n(), - sequential_claim(Ring, Node, TN). 
- -%% @private fall back to diagonal striping vnodes across nodes in a -%% sequential round robin (eg n1 | n2 | n3 | n4 | n5 | n1 | n2 | n3 -%% etc) However, different to `claim_rebalance_n', this function -%% attempts to eliminate tail violations (for example a ring that -%% starts/ends n1 | n2 | ...| n3 | n4 | n1) --spec sequential_claim( - riak_core_ring:riak_core_ring(), node(), integer()) -> - riak_core_ring:riak_core_ring(). -sequential_claim(Ring, Node, TargetN) -> - OrigNodes = lists:usort([Node|riak_core_ring:claiming_members(Ring)]), - Nodes = get_nodes_by_location(OrigNodes, Ring), - NodeCount = length(Nodes), - RingSize = riak_core_ring:num_partitions(Ring), - - Overhang = RingSize rem NodeCount, - HasTailViolation = (Overhang > 0 andalso Overhang < TargetN), - Shortfall = TargetN - Overhang, - SolveableNodeViolation = - solveable_violation(RingSize, NodeCount, TargetN, Shortfall) - and HasTailViolation, - - LocationsSupported = - riak_core_location:support_locations_claim(Ring, TargetN), - {SolveableLocationViolation, LocationShortfall} = - case {LocationsSupported, Overhang, RingSize div NodeCount} of - {true, OH, Loops} when OH > 0, Loops >= 1 -> - MinDistance = - check_for_location_tail_violation( - Nodes, Ring, OH, TargetN), - case MinDistance of - MD when MD =< TargetN -> - SLV = - solveable_violation( - RingSize, NodeCount, TargetN, TargetN - MD), - {SLV, TargetN - MD}; - _ -> - {false, 0} - end; - _NotSolveable -> - {false, 0} - end, - - Partitions = lists:sort([ I || {I, _} <- riak_core_ring:all_owners(Ring) ]), - Zipped = - case {SolveableLocationViolation, SolveableNodeViolation} of - {true, _} -> - F = location_finder(Ring), - Nodelist = - solve_tail_violations( - RingSize, Nodes, LocationShortfall, TargetN, true, F), - lists:zip(Partitions, Nodelist); - {_, true} -> - Nodelist = - solve_tail_violations( - RingSize, Nodes, Shortfall, TargetN, false, undefined), - lists:zip(Partitions, Nodelist); - _ -> - riak_core_membership_claim:diagonal_stripe(Ring, Nodes) - end, - - lists:foldl( - fun({P, N}, Acc) -> riak_core_ring:transfer_node(P, N, Acc) end, - Ring, - Zipped). - --spec location_finder(riak_core_ring:riak_core_ring()) -> location_finder(). -location_finder(Ring) -> - LocationD = riak_core_ring:get_nodes_locations(Ring), - fun(N) -> - riak_core_location:get_node_location(N, LocationD) - end. - --spec check_for_location_tail_violation( - list(node()), - riak_core_ring:riak_core_ring(), - pos_integer(), - pos_integer()) -> pos_integer(). -check_for_location_tail_violation(Nodes, Ring, OH, TargetN) -> - {LastLoop, ExtraNodes} = lists:split(OH, Nodes), - LastNodes = - lists:reverse( - lists:sublist( - lists:reverse(ExtraNodes ++ LastLoop), TargetN - 1)), - FirstNodes = lists:sublist(Nodes, TargetN - 1), - LocationFinder = location_finder(Ring), - LastLocations = lists:map(LocationFinder, LastNodes), - FirstLocations = - lists:zip( - lists:map(LocationFinder, FirstNodes), - lists:seq(1, TargetN - 1)), - {MinDistance, _} = - lists:foldl( - fun(L, {MinStep, TailStep}) -> - case lists:keyfind(L, 1, FirstLocations) of - {L, N} -> - {min(TailStep + N, MinStep), TailStep - 1}; - false -> - {MinStep, TailStep - 1} - end - end, - {TargetN, TargetN - 2}, - LastLocations), - MinDistance. - - --spec solveable_violation( - pos_integer(), pos_integer(), pos_integer(), pos_integer()) -> boolean(). 
-solveable_violation(RingSize, NodeCount, TargetN, Shortfall) -> - case RingSize div NodeCount of - LoopCount when LoopCount >= Shortfall -> - true; - LoopCount -> - SplitSize = Shortfall div LoopCount, - BiggestTake = Shortfall - ((LoopCount - 1) * SplitSize), - (NodeCount - BiggestTake) >= TargetN - end. - -%% @doc -%% The node list must be of length ring size. It is made up of a set of -%% complete loops of the node list, and then a partial loop with the addition -%% of the shortfall. The for each node in the shortfall a node in the complete -%% loops must be removed --spec solve_tail_violations( - pos_integer(), - [node()], - non_neg_integer(), - pos_integer(), - boolean(), - undefined|location_finder()) -> - [[node()]]. -solve_tail_violations( - RingSize, Nodes, Shortfall, _TargetN, false, _LocFinder) -> - {LastLoop, Remainder} = - lists:split(RingSize rem length(Nodes), Nodes), - ExcessLoop = lists:sublist(Remainder, Shortfall), - Tail = LastLoop ++ ExcessLoop, - LoopCount = RingSize div length(Nodes), - RemoveList = - divide_list_for_removes(lists:reverse(ExcessLoop), LoopCount), - CompleteLoops = - lists:append( - lists:duplicate(LoopCount - length(RemoveList), Nodes)), - PartialLoops = - lists:map( - fun(ENL) -> lists:subtract(Nodes, ENL) end, - RemoveList), - CompleteLoops ++ lists:append(PartialLoops) ++ Tail; -solve_tail_violations( - RingSize, Nodes, Shortfall, TargetN, true, LocFinder) -> - {LastLoop, Remainder} = - lists:split(RingSize rem length(Nodes), Nodes), - PostLoop = lists:sublist(Nodes, TargetN - 1), - PreExcess = - lists:reverse( - lists:sublist( - lists:reverse(Nodes ++ LastLoop), TargetN - 1)), - {SafeList, SafeAdditions} = - case safe_to_remove(Nodes, LastLoop, TargetN, LocFinder) of - SL when length(SL) >= Shortfall -> - {lists:sublist(SL, Shortfall), Remainder}; - SL -> - RemovableExcess = - safe_to_remove( - Nodes, Remainder, TargetN, LocFinder), - {SL, RemovableExcess} - end, - ExcessLoop = - case length(SafeAdditions) of - NodesToCheck when NodesToCheck >= Shortfall -> - safe_to_add( - PreExcess, PostLoop, SafeAdditions, LocFinder, Shortfall); - NodesToCheck -> - CheckList = - SafeAdditions ++ - lists:sublist( - lists:subtract(Remainder, SafeAdditions), - Shortfall - NodesToCheck), - safe_to_add( - PreExcess, PostLoop, CheckList, LocFinder, Shortfall) - end, - - Tail = LastLoop ++ ExcessLoop, - LoopCount = RingSize div length(Nodes), - RemoveCount = length(ExcessLoop), - UpdSafeList = - SafeList ++ - lists:filter( - fun(N) -> lists:member(N, ExcessLoop) end, SafeAdditions) ++ - (ExcessLoop -- SafeAdditions), - - RemoveList = - divide_list_for_removes( - lists:sublist(UpdSafeList, RemoveCount), LoopCount), - RemoveLoops = length(RemoveList), - - case LoopCount > (2 * RemoveLoops) of - true -> - PartialLoops = - lists:map( - fun(ENL) -> lists:subtract(Nodes, ENL) ++ Nodes end, - RemoveList), - CompleteLoops = - lists:flatten( - lists:duplicate(LoopCount - (2 * RemoveLoops), Nodes)), - CompleteLoops ++ lists:append(PartialLoops) ++ Tail; - false -> - CompleteLoops = - lists:flatten( - lists:duplicate(LoopCount - RemoveLoops, Nodes)), - PartialLoops = - lists:map( - fun(ENL) -> lists:subtract(Nodes, ENL) end, - RemoveList), - CompleteLoops ++ lists:append(PartialLoops) ++ Tail - end. - - --spec safe_to_add( - list(node()), - list(node()), - list(node()), - location_finder()|undefined, - pos_integer()) -> list(node()). 
-safe_to_add(PreExcess, PostLoop, NodesToCheck, LocFinder, Shortfall) -> - NodePositions = - score_for_adding( - lists:zip( - lists:map(LocFinder, lists:reverse(PreExcess)), - lists:seq(1, length(PreExcess))), - lists:zip( - lists:map(LocFinder, PostLoop), - lists:seq(1, length(PostLoop))), - lists:map(LocFinder, NodesToCheck), - [], - Shortfall), - PositionsByNode = lists:zip(NodePositions, NodesToCheck), - Positions = lists:seq(1, Shortfall), - case choose_positions(Positions, PositionsByNode, [], {[], LocFinder}) of - fail -> - lists:sublist(NodesToCheck, Shortfall); - NodeList -> - lists:reverse(NodeList) - end. - -choose_positions([], _PositionsByNode, NodeList, _LocationCheck) -> - NodeList; -choose_positions([Pos|RestPos], PositionsByNode, NodeList, {LocList, LocF}) -> - SortedPositionsByNode = - lists:filter( - fun({PL, _N}) -> length(PL) > 0 end, - lists:sort(PositionsByNode)), - case SortedPositionsByNode of - [{TopPL, TopN}|RestPBN] -> - TopL = LocF(TopN), - case {lists:member(Pos, TopPL), lists:member(TopL, LocList)} of - {true, false} -> - choose_positions( - RestPos, - lists:map( - fun({PL, N}) -> {PL -- [Pos], N} end, - RestPBN), - [TopN|NodeList], - {[TopL|LocList], LocF}); - {true, true} -> - choose_positions( - [Pos|RestPos], - RestPBN, - NodeList, - {LocList, LocF}); - _ -> - fail - end; - _ -> - fail - end. - - --spec score_for_adding( - list({node()|atom(), pos_integer()}), - list({node()|atom(), pos_integer()}), - list(node()|atom()), - list(list(pos_integer())), - pos_integer()) -> - list(list(pos_integer())). -score_for_adding(_PreExcess, _PostLoop, [], NodePositions, _Shortfall) -> - lists:reverse(NodePositions); -score_for_adding(PreExcess, PostLoop, [HD|Rest], NodePositions, Shortfall) -> - BackPositions = - case lists:keyfind(HD, 1, PreExcess) of - {HD, BS} -> - lists:filter( - fun(P) -> - {A, B} = {(P + BS - 1), length(PreExcess)}, - A > B - end, - lists:seq(1, Shortfall) - ); - false -> - lists:seq(1, Shortfall) - end, - ForwardPositions = - case lists:keyfind(HD, 1, PostLoop) of - {HD, FS} -> - lists:filter( - fun(P) -> - {A, B} = {(FS + Shortfall - P), length(PostLoop)}, - A > B - end, - lists:seq(1, Shortfall)); - false -> - lists:seq(1, Shortfall) - end, - SupportedPositions = - lists:filter( - fun(BP) -> lists:member(BP, ForwardPositions) end, BackPositions), - score_for_adding( - PreExcess, - PostLoop, - Rest, - [SupportedPositions|NodePositions], - Shortfall). - - --spec safe_to_remove( - list(node()), - list(node()), - pos_integer(), - location_finder()|undefined) -> list(node()). -safe_to_remove(Nodes, NodesToCheck, TargetN, LocFinder) -> - LocationFinder = fun(N) -> {N, LocFinder(N)} end, - safe_to_remove_loop( - lists:map(LocationFinder, Nodes), - lists:map(LocationFinder, NodesToCheck), - [], - TargetN). 
- -safe_to_remove_loop(_Nodes, [], SafeList, _TargetN) -> - SafeList; -safe_to_remove_loop(Nodes, [HD|Rest], SafeList, TargetN) -> - WrappedNodes = (Nodes -- [HD]) ++ lists:sublist(Nodes, 1, TargetN), - {Node, _Location} = HD, - CheckFun = - fun({_N, L}, CheckList) -> - case lists:keyfind(L, 2, CheckList) of - false -> - false; - _ -> - true - end - end, - IsSafe = - lists:foldl( - fun(N, Acc) -> - case Acc of - fail -> - fail; - LastNminus1 when is_list(LastNminus1) -> - case CheckFun(N, LastNminus1) of - false -> - [N|lists:sublist(LastNminus1, TargetN - 2)]; - true -> - fail - end - end - end, - [], - WrappedNodes), - case IsSafe of - fail -> - safe_to_remove_loop(Nodes, Rest, SafeList, TargetN); - _ -> - safe_to_remove_loop(Nodes, Rest, [Node|SafeList], TargetN) - end. - - -%% @doc -%% Normally need to remove one of the excess nodes each loop around the node -%% list. However, if there are not enough loops, more than one can be removed -%% per loop - assuming the solveable_violation/4 condition passes (i.e. this -%% will not breach the TargetN). --spec divide_list_for_removes(list(node()), pos_integer()) - -> list(list(node())). -divide_list_for_removes(Excess, LoopCount) when LoopCount >= length(Excess) -> - lists:map(fun(N) -> [N] end, Excess); -divide_list_for_removes(Excess, 1) -> - [Excess]; -divide_list_for_removes(Excess, LoopCount) -> - FetchesPerLoop = length(Excess) div LoopCount, - LastFetch = length(Excess) - FetchesPerLoop * (LoopCount - 1), - {[], GroupedFetches} = - lists:foldl( - fun(FC, {ENs, GroupedENs}) -> - {NextGroup, Remainder} = lists:split(FC, ENs), - {Remainder, GroupedENs ++ [NextGroup]} - end, - {Excess, []}, - lists:duplicate(LoopCount - 1, FetchesPerLoop) ++ [LastFetch] - ), - GroupedFetches. - -%% @private -%% Get active nodes ordered by taking location parameters into account --spec get_nodes_by_location( - [node()|undefined], riak_core_ring:riak_core_ring()) -> [node()|undefined]. -get_nodes_by_location(Nodes, Ring) -> - NodesLocations = riak_core_ring:get_nodes_locations(Ring), - case riak_core_location:has_location_set_in_cluster(NodesLocations) of - false -> - Nodes; - true -> - LocationNodesD = - riak_core_location:get_location_nodes(Nodes, NodesLocations), - stripe_nodes_by_location(LocationNodesD) - end. - --spec stripe_nodes_by_location(dict:dict()) -> list(node()|undefined). -stripe_nodes_by_location(NodesByLocation) -> - [LNodes|RestLNodes] = - sort_lists_by_length( - lists:map(fun({_L, NL}) -> NL end, dict:to_list(NodesByLocation))), - stripe_nodes_by_location( - RestLNodes, - lists:map( - fun({I, L}) -> {1, I, L} end, - lists:zip( - lists:seq(1, length(LNodes)), - lists:map(fun(N) -> [N] end, LNodes)))). - -stripe_nodes_by_location([], Acc) -> - lists:flatten( - lists:map(fun({_L, _I, NL}) -> NL end, lists:sort(Acc)) - ); -stripe_nodes_by_location([LNodes|OtherLNodes], Acc) -> - SortedAcc = lists:sort(Acc), - {UpdatedAcc, []} = - lists:mapfoldl( - fun({L, I, NodeList}, LocationNodesToAdd) -> - case LocationNodesToAdd of - [NodeToAdd|TailNodes] -> - {{L + 1, I, NodeList ++ [NodeToAdd]}, TailNodes}; - [] -> - {{L, I, NodeList}, []} - end - end, - LNodes, - SortedAcc), - stripe_nodes_by_location(OtherLNodes, UpdatedAcc). - - -sort_lists_by_length(ListOfLists) -> - lists:sort(fun(L1, L2) -> length(L1) >= length(L2) end, ListOfLists). - - -%% =================================================================== -%% eunit tests -%% =================================================================== - --ifdef(TEST). 
- --include_lib("eunit/include/eunit.hrl"). - -choose_positions_test() -> - NodePositions = [{[1,2],l4n5},{[1,2],l5n5},{[1,2],l6n5},{[],l1n6}], - Positions = [1, 2], - LocF = fun(N) -> list_to_atom(lists:sublist(atom_to_list(N), 2)) end, - ?assertMatch( - [l4n5, l5n5], - lists:reverse( - choose_positions(Positions, NodePositions, [], {[], LocF}))). - - -score_for_adding_test() -> - PreExcess = [n2, n3, n4], - PostLoop = [n1, n2, n3], - PE = lists:zip(lists:reverse(PreExcess), lists:seq(1, length(PreExcess))), - PL = lists:zip(PostLoop, lists:seq(1, length(PostLoop))), - Candidates = [n1, n4, n5, n6, n7], - Shortfall = 4, - ExpectedResult = - [[1], [4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], - ActualResult = - score_for_adding(PE, PL, Candidates, [], Shortfall), - ?assertMatch(ExpectedResult, ActualResult). - -simple_cluster_t1_test() -> - RingSize = 32, - TargetN = 4, - NodeList = [n1, n2, n3, n4, n5, n6], - R0 = riak_core_ring:fresh(RingSize, n1), - R1 = - lists:foldl( - fun(N, AccR) -> riak_core_ring:add_member(n1, AccR, N) end, - R0, - NodeList -- [n1]), - Props = [{target_n_val, TargetN}], - RClaim = - riak_core_membership_claim:claim( - R1, - {riak_core_membership_claim, default_wants_claim}, - {riak_core_claim_location, choose_claim_v4, Props}), - Failures = meets_target_n(RClaim, TargetN), - lists:foreach(fun(F) -> io:format("Failure ~p~n", [F]) end, Failures), - ?assert(length(Failures) == 0). - -sort_list_t1_test() -> - OtherLoc = - [[l2n1, l2n2], [l3n1, l3n2], [l4n1, l4n2], [l5n1, l5n2], - [l6n1], [l7n1], [l8n1]], - FirstLoc = [{1, 1, [l1n1]}, {1, 2, [l1n2]}], - NodeList = stripe_nodes_by_location(OtherLoc, FirstLoc), - ExpectedNodeList = - [l1n2, l2n2, l3n2, l4n2, l5n2, l7n1, - l1n1, l2n1, l3n1, l4n1, l5n1, l6n1, l8n1], - ?assertMatch( - ExpectedNodeList, NodeList - ). - -prefilter_violations_test_() -> - % Be strict on test timeout. Unrefined code took > 10s, whereas the - % refactored code should be << 1s. - {timeout, 5, fun prefilter_violations_perf/0}. 
- -prefilter_violations_perf() -> - JoiningNodes = - [{l1n2, loc1}, {l1n3, loc1}, {l1n4, loc1}, - {l2n1, loc2}, {l2n2, loc2}, {l2n3, loc2}, {l2n4, loc2}, - {l3n1, loc3}, {l3n2, loc3}, {l3n3, loc3}, {l3n4, loc3}, - {l4n1, loc4}, {l4n2, loc4}, {l4n3, loc4}, {l4n4, loc4}, - {l5n1, loc5}, {l5n2, loc5}, {l5n3, loc5}, - {l6n1, loc6}, {l6n2, loc6}, {l6n3, loc6}, {l6n4, loc6}, - {l7n1, loc7}, {l7n2, loc7}], - N1 = l1n1, - N1Loc = loc1, - RingSize = 4096, - io:format( - "Testing NodeList ~w with RingSize ~w~n", - [[{N1, N1Loc}|JoiningNodes], RingSize] - ), - R1 = - riak_core_ring:set_node_location( - N1, - N1Loc, - riak_core_ring:fresh(RingSize, N1)), - - RAll = - lists:foldl( - fun({N, L}, AccR) -> - AccR0 = riak_core_ring:add_member(N1, AccR, N), - riak_core_ring:set_node_location(N, L, AccR0) - end, - R1, - JoiningNodes - ), - Owners = riak_core_ring:all_owners(RAll), - AllIndices = - lists:zip( - lists:seq(0, length(Owners)-1), [Idx || {Idx, _} <- Owners]), - - {T0, FilteredIndices0} = - timer:tc( - fun prefilter_violations/6, - [RAll, l1n2, AllIndices, AllIndices, 4, RingSize]), - io:format("Prefilter violations took ~w ms~n", [T0 div 1000]), - ?assertMatch(RingSize, length(FilteredIndices0)), - - {T1, FilteredIndices1} = - timer:tc( - fun prefilter_violations/6, - [RAll, l2n3, AllIndices, AllIndices, 4, RingSize]), - io:format("Prefilter violations took ~w ms~n", [T1 div 1000]), - ?assertMatch(RingSize, length(FilteredIndices1)), - - RTrans = riak_core_ring:transfer_node(0, l2n3, RAll), - {T2, FilteredIndices2} = - timer:tc( - fun prefilter_violations/6, - [RTrans, l2n3, AllIndices, AllIndices, 4, RingSize]), - io:format("Prefilter violations took ~w ms~n", [T2 div 1000]), - ?assertMatch(RingSize, length(FilteredIndices2) + 7), - - {T3, FilteredIndices3} = - timer:tc( - fun prefilter_violations/6, - [RTrans, l1n2, AllIndices, AllIndices, 4, RingSize]), - io:format("Prefilter violations took ~w ms~n", [T3 div 1000]), - io:format("Filtered instances ~w~n", [AllIndices -- FilteredIndices3]), - ?assertMatch(RingSize, length(FilteredIndices3) + 1), - - {T4, FilteredIndices4} = - timer:tc( - fun prefilter_violations/6, - [RTrans, l2n4, AllIndices, AllIndices, 4, RingSize]), - io:format("Prefilter violations took ~w ms~n", [T4 div 1000]), - ?assertMatch(RingSize, length(FilteredIndices4) + 7 - 1). - -location_seqclaim_t1_test() -> - JoiningNodes = - [{n2, loc1}, - {n3, loc2}, {n4, loc2}, - {n5, loc3}, {n6, loc3}, - {n7, loc4}, {n8, loc4}, - {n9, loc5}, {n10, loc5} - ], - location_claim_tester(n1, loc1, JoiningNodes, 64), - location_claim_tester(n1, loc1, JoiningNodes, 128), - location_claim_tester(n1, loc1, JoiningNodes, 256), - location_claim_tester(n1, loc1, JoiningNodes, 512), - location_claim_tester(n1, loc1, JoiningNodes, 1024), - location_claim_tester(n1, loc1, JoiningNodes, 2048). - -location_seqclaim_t2_test() -> - JoiningNodes = - [{n2, loc1}, - {n3, loc2}, {n4, loc2}, - {n5, loc3}, {n6, loc3}, - {n7, loc4}, {n8, loc4} - ], - location_claim_tester(n1, loc1, JoiningNodes, 64), - location_claim_tester(n1, loc1, JoiningNodes, 128), - location_claim_tester(n1, loc1, JoiningNodes, 256), - location_claim_tester(n1, loc1, JoiningNodes, 512), - location_claim_tester(n1, loc1, JoiningNodes, 1024), - location_claim_tester(n1, loc1, JoiningNodes, 2048). 
- -location_seqclaim_t3_test() -> - JoiningNodes = - [{n2, loc1}, - {n3, loc2}, {n4, loc2}, - {n5, loc3}, {n6, loc3}, - {n7, loc4}, {n8, loc4}, - {n9, loc5}, {n10, loc5}, - {n11, loc6}, {n12, loc7}, {n13, loc8} - ], - location_claim_tester(n1, loc1, JoiningNodes, 64), - location_claim_tester(n1, loc1, JoiningNodes, 128), - location_claim_tester(n1, loc1, JoiningNodes, 256), - location_claim_tester(n1, loc1, JoiningNodes, 512), - location_claim_tester(n1, loc1, JoiningNodes, 1024), - location_claim_tester(n1, loc1, JoiningNodes, 2048). - -location_seqclaim_t4_test() -> - JoiningNodes = - [{l1n2, loc1}, {l1n3, loc1}, {l1n4, loc1}, - {l1n5, loc1}, {l1n6, loc1}, {l1n7, loc1}, {l1n8, loc1}, - {l2n1, loc2}, {l2n2, loc2}, {l2n3, loc2}, {l2n4, loc2}, - {l2n5, loc2}, {l2n6, loc2}, {l2n7, loc2}, {l2n8, loc2}, - {l3n1, loc3}, {l3n2, loc3}, {l3n3, loc3}, {l3n4, loc3}, - {l3n5, loc3}, {l3n6, loc3}, {l3n7, loc3}, {l3n8, loc3}, - {l4n1, loc4}, {l4n2, loc4}, {l4n3, loc4}, {l4n4, loc4}, - {l4n5, loc4}, {l4n6, loc4}, {l4n7, loc4}, {l4n8, loc4}, - {l5n1, loc5}, {l5n2, loc5}, {l5n3, loc5}, {l5n4, loc5}, - {l5n5, loc5}, {l5n6, loc5}, {l5n7, loc5}, - {l6n1, loc6}, {l6n2, loc6}, {l6n3, loc6}, {l6n4, loc6}, - {l6n5, loc6}, {l6n6, loc6}, {l6n7, loc6}, - {l7n1, loc7}, {l7n2, loc7}, {l7n3, loc7}], - location_claim_tester(l1n1, loc1, JoiningNodes, 64), - location_claim_tester(l1n1, loc1, JoiningNodes, 128), - location_claim_tester(l1n1, loc1, JoiningNodes, 256), - location_claim_tester(l1n1, loc1, JoiningNodes, 512), - location_claim_tester(l1n1, loc1, JoiningNodes, 1024), - location_claim_tester(l1n1, loc1, JoiningNodes, 2048). - -location_seqclaim_t5_test() -> - JoiningNodes = - [{l1n2, loc1}, {l1n3, loc1}, {l1n4, loc1}, - {l2n1, loc2}, {l2n2, loc2}, {l2n3, loc2}, {l2n4, loc2}, - {l3n1, loc3}, {l3n2, loc3}, {l3n3, loc3}, {l3n4, loc3}, - {l4n1, loc4}, {l4n2, loc4}, {l4n3, loc4}, {l4n4, loc4}, - {l5n1, loc5}, {l5n2, loc5}, {l5n3, loc5}, - {l6n1, loc6}, {l6n2, loc6}, {l6n3, loc6}, {l6n4, loc6}, - {l7n1, loc7}, {l7n2, loc7}], - location_claim_tester(l1n1, loc1, JoiningNodes, 128), - location_claim_tester(l1n1, loc1, JoiningNodes, 256), - location_claim_tester(l1n1, loc1, JoiningNodes, 512), - location_claim_tester(l1n1, loc1, JoiningNodes, 1024), - location_claim_tester(l1n1, loc1, JoiningNodes, 2048). - -location_seqclaim_t6_test() -> - JoiningNodes = - [{l1n2, loc1}, {l1n3, loc1}, {l1n4, loc1}, - {l1n5, loc1}, {l1n6, loc1}, {l1n7, loc1}, {l1n8, loc1}, - {l2n1, loc2}, {l2n2, loc2}, {l2n3, loc2}, {l2n4, loc2}, - {l2n5, loc2}, {l2n6, loc2}, {l2n7, loc2}, {l2n8, loc2}, - {l3n1, loc3}, {l3n2, loc3}, {l3n3, loc3}, {l3n4, loc3}, - {l3n5, loc3}, {l3n6, loc3}, {l3n7, loc3}, {l3n8, loc3}, - {l4n1, loc4}, {l4n2, loc4}, {l4n3, loc4}, {l4n4, loc4}, - {l4n5, loc4}, {l4n6, loc4}, {l4n7, loc4}, {l4n8, loc4}, - {l5n1, loc5}, {l5n2, loc5}, - {l6n1, loc6}, {l6n2, loc6}, {l6n3, loc6}, {l6n4, loc6}, - {l6n5, loc6}, {l6n6, loc6}, {l6n7, loc6}, {l6n8, loc8}], - location_claim_tester(l1n1, loc1, JoiningNodes, 256), - location_claim_tester(l1n1, loc1, JoiningNodes, 512), - location_claim_tester(l1n1, loc1, JoiningNodes, 1024), - location_claim_tester(l1n1, loc1, JoiningNodes, 2048). 
- -location_seqclaim_t7_test() -> - JoiningNodes = - [{l1n2, loc1}, {l1n3, loc1}, - {l2n1, loc2}, {l2n2, loc2}, {l2n3, loc2}, - {l3n1, loc3}, {l3n2, loc3}, {l3n3, loc3}, - {l4n1, loc4}, {l4n2, loc4}, - {l5n1, loc5}, {l5n2, loc5}, - {l6n1, loc6}, {l6n2, loc6}], - location_claim_tester(l1n1, loc1, JoiningNodes, 256), - location_claim_tester(l1n1, loc1, JoiningNodes, 512), - location_claim_tester(l1n1, loc1, JoiningNodes, 1024), - location_claim_tester(l1n1, loc1, JoiningNodes, 2048). - -location_seqclaim_t8_test() -> - JoiningNodes = - [{l1n2, loc1}, {l1n3, loc1}, {l1n4, loc1}, - {l2n1, loc2}, {l2n2, loc2}, {l2n3, loc2}, - {l3n1, loc3}, {l3n2, loc3}, {l3n3, loc3}, - {l4n1, loc4}, {l4n2, loc4}, {l4n3, loc4}], - location_claim_tester(l1n1, loc1, JoiningNodes, 256, sequential_claim, 3), - location_claim_tester(l1n1, loc1, JoiningNodes, 512, sequential_claim, 3), - location_claim_tester(l1n1, loc1, JoiningNodes, 1024, sequential_claim, 3), - location_claim_tester(l1n1, loc1, JoiningNodes, 2048, sequential_claim, 3). - -location_claim_tester(N1, N1Loc, NodeLocList, RingSize) -> - location_claim_tester( - N1, N1Loc, NodeLocList, RingSize, sequential_claim, 4). - -location_claim_tester( - N1, N1Loc, NodeLocList, RingSize, ClaimFun, TargetN) -> - io:format( - "Testing NodeList ~w with RingSize ~w~n", - [[{N1, N1Loc}|NodeLocList], RingSize] - ), - R1 = - riak_core_ring:set_node_location( - N1, - N1Loc, - riak_core_ring:fresh(RingSize, N1)), - - RAll = - lists:foldl( - fun({N, L}, AccR) -> - AccR0 = riak_core_ring:add_member(N1, AccR, N), - riak_core_ring:set_node_location(N, L, AccR0) - end, - R1, - NodeLocList - ), - Params = - case ClaimFun of - sequential_claim -> - TargetN; - choose_claim_v4 -> - [{target_n_val, 3}] - end, - RClaim = - riak_core_membership_claim:claim( - RAll, - {riak_core_membership_claim, default_wants_claim}, - {riak_core_claim_location, ClaimFun, Params}), - {RingSize, Mappings} = riak_core_ring:chash(RClaim), - - check_for_failures(Mappings, TargetN, RClaim). - - -check_for_failures(Mappings, TargetN, RClaim) -> - NLs = riak_core_ring:get_nodes_locations(RClaim), - LocationMap = - lists:map( - fun({Idx, N}) -> - {Idx, riak_core_location:get_node_location(N, NLs)} - end, - Mappings), - Prefix = lists:sublist(LocationMap, 3), - CheckableMap = LocationMap ++ Prefix, - {_, Failures} = - lists:foldl( - fun({Idx, L}, {LastNminus1, Fails}) -> - case lists:member(L, LastNminus1) of - false -> - {[L|lists:sublist(LastNminus1, TargetN - 2)], Fails}; - true -> - {[L|lists:sublist(LastNminus1, TargetN - 2)], - [{Idx, L, LastNminus1}|Fails]} - end - end, - {[], []}, - CheckableMap - ), - lists:foreach(fun(F) -> io:format("Failure ~p~n", [F]) end, Failures), - ?assert(length(Failures) == 0). - - -location_multistage_t1_test_() -> - {timeout, 60, fun location_multistage_t1_tester/0}. - -location_multistage_t2_test_() -> - {timeout, 60, fun location_multistage_t2_tester/0}. - -location_multistage_t3_test_() -> - {timeout, 60, fun location_multistage_t3_tester/0}. - -location_multistage_t1_tester() -> - %% This is a tricky corner case where we would fail to meet TargetN for - %% locations if joining all 9 nodes in one claim (as sequential_claim will - %% not succeed). 
However, If we join 8 nodes, then add the 9th, TargetN - %% is always achieved - JoiningNodes = - [{l1n2, loc1}, - {l2n3, loc2}, {l2n4, loc2}, - {l3n5, loc3}, {l3n6, loc3}, - {l4n7, loc4}, {l4n8, loc4} - ], - location_multistage_claim_tester(64, JoiningNodes, 4, l5n9, loc5, 4), - location_multistage_claim_tester(128, JoiningNodes, 4, l5n9, loc5, 4), - location_multistage_claim_tester(256, JoiningNodes, 4, l5n9, loc5, 4), - location_multistage_claim_tester(512, JoiningNodes, 4, l5n9, loc5, 4), - location_multistage_claim_tester(1024, JoiningNodes, 4, l5n9, loc5, 4), - location_multistage_claim_tester(2048, JoiningNodes, 4, l5n9, loc5, 4). - -location_multistage_t2_tester() -> - %% This is a tricky corner case as with location_multistage_t1_tester/1, - %% but now, because the TargetN does not divide evenly by the ring size - %% only TargetN - 1 can be achieved for locations. - JoiningNodes = - [{l1n2, loc1}, - {l2n3, loc2}, {l2n4, loc2}, - {l3n5, loc3}, {l3n6, loc3} - ], - location_multistage_claim_tester(64, JoiningNodes, 3, l4n7, loc4, 2), - location_multistage_claim_tester(128, JoiningNodes, 3, l4n7, loc4, 2), - location_multistage_claim_tester(256, JoiningNodes, 3, l4n7, loc4, 2), - location_multistage_claim_tester(512, JoiningNodes, 3, l4n7, loc4, 2), - location_multistage_claim_tester(1024, JoiningNodes, 3, l4n7, loc4, 2), - location_multistage_claim_tester(2048, JoiningNodes, 3, l4n7, loc4, 2). - -location_multistage_t3_tester() -> - JoiningNodes = - [{l1n2, loc1}, - {l2n3, loc2}, {l2n4, loc2}, - {l3n5, loc3}, {l3n6, loc3}, - {l4n7, loc4}, {l4n8, loc4}, - {l5n9, loc5} - ], - - location_multistage_claim_tester(64, JoiningNodes, 4, l5n10, loc5, 4), - location_multistage_claim_tester(128, JoiningNodes, 4, l5n10, loc5, 4), - location_multistage_claim_tester(256, JoiningNodes, 4, l5n10, loc5, 4), - location_multistage_claim_tester(512, JoiningNodes, 4, l5n10, loc5, 4), - location_multistage_claim_tester(1024, JoiningNodes, 4, l5n10, loc5, 4), - location_multistage_claim_tester(2048, JoiningNodes, 4, l5n10, loc5, 4). 
- -location_multistage_claim_tester( - RingSize, JoiningNodes, TargetN, NewNode, NewLocation, VerifyN) -> - SW0 = os:timestamp(), - N1 = l1n1, - N1Loc = loc1, - io:format( - "Testing NodeList ~w with RingSize ~w~n", - [[{N1, N1Loc}|JoiningNodes], RingSize] - ), - R1 = - riak_core_ring:set_node_location( - N1, - N1Loc, - riak_core_ring:fresh(RingSize, N1)), - - RAll = - lists:foldl( - fun({N, L}, AccR) -> - AccR0 = riak_core_ring:add_member(N1, AccR, N), - riak_core_ring:set_node_location(N, L, AccR0) - end, - R1, - JoiningNodes - ), - Params = [{target_n_val, TargetN}], - SW1 = os:timestamp(), - RClaimInit = - riak_core_membership_claim:claim( - RAll, - {riak_core_membership_claim, default_wants_claim}, - {riak_core_claim_location, choose_claim_v4, Params}), - SW2 = os:timestamp(), - io:format("Reclaiming without committing~n"), - - RingExtendA = - riak_core_ring:set_node_location( - NewNode, - NewLocation, - riak_core_ring:add_member(N1, RClaimInit, NewNode)), - RClaimExtendA = - riak_core_membership_claim:claim( - RingExtendA, - {riak_core_membership_claim, default_wants_claim}, - {riak_core_claim_location, choose_claim_v4, Params}), - - io:format("Commit initial claim~n"), - SW3 = os:timestamp(), - - RClaimInitCommit = - riak_core_ring:increment_vclock( - node(), - riak_core_ring:clear_location_changed(RClaimInit)), - - io:format("Reclaiming following commit~n"), - SW4 = os:timestamp(), - - RingExtendB = - riak_core_ring:set_node_location( - NewNode, - NewLocation, - riak_core_ring:add_member(N1, RClaimInitCommit, NewNode)), - RClaimExtendB = - riak_core_membership_claim:claim( - RingExtendB, - {riak_core_membership_claim, default_wants_claim}, - {riak_core_claim_location, choose_claim_v4, Params}), - - {_RingSizeInit, MappingsInit} = riak_core_ring:chash(RClaimInit), - {RingSizeA, MappingsA} = riak_core_ring:chash(RClaimExtendA), - {RingSizeB, MappingsB} = riak_core_ring:chash(RClaimExtendB), - - SW5 = os:timestamp(), - - ?assert(RingSizeA == RingSizeB), - ?assert(MappingsA == MappingsB), - - io:format("Testing initial Mappings:~n~n~p~n", [MappingsInit]), - check_for_failures(MappingsInit, VerifyN, RClaimInit), - io:format("Testing secondary Mappings:~n~n~p~n", [MappingsB]), - check_for_failures(MappingsB, VerifyN, RClaimExtendB), - - SW6 = os:timestamp(), - io:format( - "Test for RingSize ~w had timings:" - "Setup ~w First Claim ~w Next Claim ~w Commit ~w Other Claims ~w Verify ~w~n", - [RingSize, - timer:now_diff(SW1, SW0) div 1000, - timer:now_diff(SW2, SW1) div 1000, - timer:now_diff(SW3, SW2) div 1000, - timer:now_diff(SW4, SW3) div 1000, - timer:now_diff(SW5, SW4) div 1000, - timer:now_diff(SW6, SW5) div 1000] - ). - -location_typical_expansion_test() -> - location_typical_expansion_tester(256), - location_typical_expansion_tester(512). 
- -location_typical_expansion_tester(RingSize) -> - N1 = l1n1, - N1Loc = loc1, - TargetN = 4, - InitJoiningNodes = - [{l1n2, loc1}, - {l2n3, loc2}, {l2n4, loc2}, - {l3n5, loc3}, {l3n6, loc3}, - {l4n7, loc4}, {l4n8, loc4}], - - io:format( - "Testing NodeList ~w with RingSize ~w~n", - [[{N1, N1Loc}|InitJoiningNodes], RingSize] - ), - R1 = - riak_core_ring:set_node_location( - N1, - N1Loc, - riak_core_ring:fresh(RingSize, N1)), - - RAll = - lists:foldl( - fun({N, L}, AccR) -> - AccR0 = riak_core_ring:add_member(N1, AccR, N), - riak_core_ring:set_node_location(N, L, AccR0) - end, - R1, - InitJoiningNodes - ), - Params = [{target_n_val, TargetN}], - RClaimInit = - riak_core_membership_claim:claim( - RAll, - {riak_core_membership_claim, default_wants_claim}, - {riak_core_claim_location, choose_claim_v4, Params}), - {RingSize, MappingsInit} = riak_core_ring:chash(RClaimInit), - - check_for_failures(MappingsInit, TargetN, RClaimInit), - - Stage1Ring = - lists:foldl( - fun(JN, R) -> - riak_core_ring:set_member(node(), R, JN, valid, same_vclock) - end, - RClaimInit, - riak_core_ring:members(RClaimInit, [joining]) - ), - - RClaimStage2 = add_node(Stage1Ring, N1, l5n9, loc5, Params), - {RingSize, Mappings2} = riak_core_ring:chash(RClaimStage2), - check_for_failures(Mappings2, TargetN, RClaimStage2), - Stage2Ring = commit_change(RClaimStage2), - - RClaimStage3 = add_node(Stage2Ring, N1, l5n10, loc5, Params), - {RingSize, Mappings3} = riak_core_ring:chash(RClaimStage3), - check_for_failures(Mappings3, TargetN, RClaimStage3), - Stage3Ring = commit_change(RClaimStage3), - - RClaimStage4 = add_node(Stage3Ring, N1, l6n11, loc6, Params), - {RingSize, Mappings4} = riak_core_ring:chash(RClaimStage4), - check_for_failures(Mappings4, TargetN, RClaimStage4), - Stage4Ring = commit_change(RClaimStage4), - - RClaimStage5 = add_node(Stage4Ring, N1, l6n12, loc6, Params), - {RingSize, Mappings5} = riak_core_ring:chash(RClaimStage5), - check_for_failures(Mappings5, TargetN, RClaimStage5), - Stage5Ring = commit_change(RClaimStage5), - - RClaimStage6 = add_node(Stage5Ring, N1, l1n13, loc1, Params), - {RingSize, Mappings6} = riak_core_ring:chash(RClaimStage6), - check_for_failures(Mappings6, TargetN, RClaimStage6), - Stage6Ring = commit_change(RClaimStage6), - - RClaimStage7 = add_node(Stage6Ring, N1, l2n14, loc2, Params), - {RingSize, Mappings7} = riak_core_ring:chash(RClaimStage7), - check_for_failures(Mappings7, TargetN, RClaimStage7), - Stage7Ring = commit_change(RClaimStage7), - - RClaimStage8 = add_node(Stage7Ring, N1, l3n15, loc3, Params), - {RingSize, Mappings8} = riak_core_ring:chash(RClaimStage8), - check_for_failures(Mappings8, TargetN, RClaimStage8), - Stage8Ring = commit_change(RClaimStage8), - - RClaimStage9 = add_node(Stage8Ring, N1, l4n16, loc4, Params), - {RingSize, Mappings9} = riak_core_ring:chash(RClaimStage9), - check_for_failures(Mappings9, TargetN, RClaimStage9), - _Stage9Ring = commit_change(RClaimStage9). 
-
-add_node(Ring, Claimant, Node, Location, Params) ->
-    RingA = riak_core_ring:add_member(Claimant, Ring, Node),
-    RingB = riak_core_ring:set_node_location(Node, Location, RingA),
-    RingC =
-        riak_core_membership_claim:claim(
-            RingB,
-            {riak_core_membership_claim, default_wants_claim},
-            {riak_core_claim_location, choose_claim_v4, Params}),
-    OwnersPre = riak_core_ring:all_owners(RingA),
-    OwnersPost = riak_core_ring:all_owners(RingC),
-    OwnersZip = lists:zip(OwnersPre, OwnersPost),
-    Next =
-        [{Idx, PrevOwner, NewOwner, [], awaiting} ||
-            {{Idx, PrevOwner}, {Idx, NewOwner}} <- OwnersZip,
-            PrevOwner /= NewOwner],
-    % StartingNodes = riak_core_ring:all_members(Ring),
-    % ExpectedTransferMax = 2 * (length(OwnersPre) div length(StartingNodes)),
-    NodeCountD =
-        lists:foldl(
-            fun({_Idx, N}, D) ->
-                dict:update_counter(N, 1, D)
-            end,
-            dict:new(),
-            OwnersPost
-        ),
-    NodeCounts =
-        lists:map(fun({_N, C}) -> C end, dict:to_list(NodeCountD)),
-    io:format(
-        % user,
-        "NodeCounts~w~n",
-        [dict:to_list(NodeCountD)]),
-    io:format(
-        % user,
-        "Adding node ~w in location ~w - ~w transfers ~w max ~w min vnodes~n",
-        [Node, Location,
-            length(Next), lists:max(NodeCounts), lists:min(NodeCounts)]),
-    ?assert(
-        (lists:min(NodeCounts) == (lists:max(NodeCounts) - 1)) or
-        (lists:min(NodeCounts) == lists:max(NodeCounts))
-    ),
-    % ?assert(length(Next) =< ExpectedTransferMax),
-    RingC.
-
-commit_change(Ring) ->
-    lists:foldl(
-        fun(JN, R) ->
-            riak_core_ring:set_member(node(), R, JN, valid, same_vclock)
-        end,
-        Ring,
-        riak_core_ring:members(Ring, [joining])
-    ).
-
--endif.
\ No newline at end of file
diff --git a/src/riak_core_claim_sim.erl b/src/riak_core_claim_sim.erl
index d003e18fe..9b45026c9 100644
--- a/src/riak_core_claim_sim.erl
+++ b/src/riak_core_claim_sim.erl
@@ -520,7 +520,7 @@ commission_claims() ->
     [{{riak_core_membership_claim, wants_claim_v2},
       {riak_core_membership_claim, choose_claim_v2}},
      {{riak_core_membership_claim, wants_claim_v3},
-      {riak_core_memberhsip_claim, choose_claim_v3}}].
+      {riak_core_membership_claim, choose_claim_v3}}].
 
 %% -------------------------------------------------------------------
diff --git a/src/riak_core_claim_swapping.erl b/src/riak_core_claim_swapping.erl
new file mode 100644
index 000000000..057778af5
--- /dev/null
+++ b/src/riak_core_claim_swapping.erl
@@ -0,0 +1,796 @@
+%% -------------------------------------------------------------------
+%%
+%% riak_core: Core Riak Application
+%%
+%% This file is provided to you under the Apache License,
+%% Version 2.0 (the "License"); you may not use this file
+%% except in compliance with the License.  You may obtain
+%% a copy of the License at
+%%
+%%   http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing,
+%% software distributed under the License is distributed on an
+%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+%% KIND, either express or implied.  See the License for the
+%% specific language governing permissions and limitations
+%% under the License.
+%%
+%% -------------------------------------------------------------------
+
+%% @doc This claim algorithm supports location awareness.
+%% In a ring, nodes may be at different locations and the algorithm
+%% tries to compute a claim that respects target_n_val w.r.t.
+%% those locations. This implies that the nodes themselves meet
+%% target_n_val, since each node lives in exactly one location.
+
+%% The algorithm allows for two different target_n_val values, one
+%% for nodes and one for locations. However, great care should be
+%% taken when using different values.
+%%
+%% Nodes that do not have a location associated with them will all
+%% end up in the same dummy location. This means that if there are no
+%% locations at all, they all end up in the same location.
+%% This would normally mean we cannot meet the target_n_val. Therefore,
+%% we treat the case without locations as a special case and
+%% only look at the target_n_val for nodes when there is no location
+%% defined.
+%%
+%% We always start from a given ring to get a solution and make a
+%% best effort to find a solution with a minimal number of transfers.
+%% If we cannot find such a solution, we fall back to generating
+%% a solution from scratch, which may involve many transfers.
+%%
+%% Not all configurations have a solution. If no solution can be found,
+%% the algorithm provides a best-effort solution.
+
+-module(riak_core_claim_swapping).
+
+-export([claim/1, claim/2,
+         choose_claim_v4/3]).
+
+-ifdef(TEST).
+-export([to_binring/2, to_config2/2]).
+-endif.
+
+%% The algorithm does not use any wants-claim logic.
+%% For backward compatibility, wants_claim_v2 can be combined with the
+%% choose function here.
+
+%% Backward compatible interface
+-spec choose_claim_v4(riak_core_ring:riak_core_ring(), node(), [{atom(), term()}]) ->
+    riak_core_ring:riak_core_ring().
+choose_claim_v4(Ring, _Node, Params) ->
+    claim(Ring, Params).
+
+-spec claim(riak_core_ring:riak_core_ring()) -> riak_core_ring:riak_core_ring().
+claim(Ring) ->
+    Params = riak_core_membership_claim:default_choose_params(),
+    claim(Ring, Params).
+
+-spec claim(riak_core_ring:riak_core_ring(), [{atom(), term()}]) ->
+    riak_core_ring:riak_core_ring().
+claim(Ring, Params0) ->
+    Params = riak_core_membership_claim:default_choose_params(Params0),
+    TargetN = proplists:get_value(target_n_val, Params),
+    LocationDict = riak_core_ring:get_nodes_locations(Ring),
+    HasLocations = riak_core_location:has_location_set_in_cluster(LocationDict),
+    %% all locations, even those that may be empty because claimants have left
+    TargetLN =
+        if HasLocations -> proplists:get_value(target_location_n_val, Params, TargetN);
+           true -> 1
+        end,
+    RingSize = riak_core_ring:num_partitions(Ring),
+    NVals = {TargetN, TargetLN},
+
+    %% Now we need to map the locations and nodes to a configuration that
+    %% is essentially a list of node counts per location.
+    %% We compute both the old and the new ring, such that we can perform updates.
+    %% This is mapped back after the algorithm is applied.
+    %% Therefore it is important to have leaving nodes mapped to
+    %% indices that do not occur in the new ring.
+
+    {BinRing0, OldLocRel} = to_binring(Ring),
+    {Config, LocRel} = to_config(Ring, OldLocRel),
+
+    %% io:format("Config = ~p RingSize ~p nval ~p\n", [Config, RingSize, NVals]),
+    BinRing1 = riak_core_claim_binring_alg:update(BinRing0, Config, NVals),
+
+    BinRing =
+        case riak_core_claim_binring_alg:zero_violations(BinRing1, NVals) of
+            false ->
+                riak_core_claim_binring_alg:solve(RingSize, Config, NVals);
+            true ->
+                BinRing1
+        end,
+
+    Inc = chash:ring_increment(RingSize),
+    SolvedNodes =
+        [ begin
+              {_Loc, Node} = proplists:get_value({LocIdx, NodeIdx}, LocRel),
+              {Inc * (Idx-1), Node}
+          end || {Idx, {LocIdx, NodeIdx}} <- enumerate(riak_core_claim_binring_alg:to_list(BinRing)) ],
+
+    NewRing =
+        lists:foldl(
+            fun({Idx, N}, Ring0) ->
+                riak_core_ring:transfer_node(Idx, N, Ring0)
+            end,
+            Ring,
+            SolvedNodes),
+
+    NewRing.
+
+
+to_binring(Ring) ->
+    LocationDict = riak_core_ring:get_nodes_locations(Ring),
+    LeavingMembers = riak_core_ring:members(Ring, [leaving]),
+    %% Make sure leaving members are at the end
+    AllOwners =
+        [ Owner || {_, Owner} <- riak_core_ring:all_owners(Ring)],
+
+    LocationRing =
+        [ {riak_core_location:get_node_location(N, LocationDict), N} || N <- AllOwners ],
+    to_binring(LocationRing, LeavingMembers).
+
+to_binring(LocationRing, LeavingMembers) ->
+    Locs = lists:usort([ L || {L, _} <- LocationRing ]),
+    LocNodes = [ {Loc, uleaving_last([N || {L, N} <- LocationRing, L == Loc], LeavingMembers)}
+                 || Loc <- Locs ],
+
+    LocationRel =
+        [{{LocIdx, Idx}, {Loc, N}} || {LocIdx, {Loc, Ns}} <- enumerate(LocNodes),
+                                      {Idx, N} <- enumerate(Ns)],
+
+    Nodes = [ begin
+                  {Node, _} = lists:keyfind({L, N}, 2, LocationRel),
+                  Node
+              end || {L, N} <- LocationRing ],
+    {riak_core_claim_binring_alg:from_list(Nodes), LocationRel}.
+
+to_config(Ring, OldLocRel) ->
+    Claiming = riak_core_ring:claiming_members(Ring),
+    LocationDict = riak_core_ring:get_nodes_locations(Ring),
+    LocationNodes = [ {riak_core_location:get_node_location(N, LocationDict), N} || N <- Claiming ],
+    to_config2(LocationNodes, OldLocRel).
+
+to_config2(LocationNodes, FixedLocRel) ->
+    OldLocIdxs = lists:usort([ {LI, L} || {{LI, _}, {L,_}} <- FixedLocRel ]),
+    OldLocs = [ L || {_, L} <- OldLocIdxs ],
+
+    %% keep order of locations the same as in old ring
+    Locs = lists:usort([ L || {L, _} <- LocationNodes ]++OldLocs),
+    NewLocs = Locs -- OldLocs,
+
+    LocIdxs = OldLocIdxs ++ enumerate(length(OldLocs) + 1, NewLocs),
+    lists:foldl(fun({LocIdx, Loc}, {Cfg, Rel}) ->
+                        RelAtLoc = order_nodes_at_loc(Loc, LocIdx, LocationNodes, FixedLocRel),
+                        {Cfg ++ [length(RelAtLoc)], Rel ++ RelAtLoc}
+                end, {[], []}, LocIdxs).
+
+order_nodes_at_loc(Loc, LocIdx, LocationNodes, FixedLocRel) ->
+    {Old, New} =
+        lists:foldl(fun({L, _}, Acc) when L /= Loc -> Acc;
+                       (LocNode, {OA, NA}) ->
+                            case lists:keyfind(LocNode, 2, FixedLocRel) of
+                                false ->
+                                    {OA, [LocNode | NA]};
+                                Found ->
+                                    {[Found|OA], NA}
+                            end
+                    end, {[], []}, LocationNodes),
+    Old ++ [{{LocIdx, Idx}, LocNode} || {Idx, LocNode} <- enumerate(length(Old) + 1, lists:usort(New))].
+
+
+uleaving_last(Nodes, LeavingNodes) ->
+    UNodes = lists:usort(Nodes),
+    ULeaving = lists:usort(LeavingNodes),
+    uleaving_last(UNodes, ULeaving, UNodes -- ULeaving).
+
+uleaving_last(_Nodes, [], Acc) ->
+    Acc;
+uleaving_last(Nodes, [Leave|Leaves], Acc) ->
+    case lists:member(Leave, Nodes) of
+        true -> uleaving_last(Nodes, Leaves, Acc ++ [Leave]);
+        false -> uleaving_last(Nodes, Leaves, Acc)
+    end.
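+
+%% For illustration (hypothetical cluster): to_binring/2 and to_config2/2
+%% map three nodes in two locations, [{loc1, n1}, {loc1, n2}, {loc2, n3}],
+%% to the configuration [2, 1] (a list of node counts per location) and to
+%% the relation [{{1,1},{loc1,n1}}, {{1,2},{loc1,n2}}, {{2,1},{loc2,n3}}],
+%% which is later used to translate the solved binary ring back to real
+%% node names. Leaving members are ordered last by uleaving_last/2 so the
+%% {LocIdx, NodeIdx} pairs they occupy do not occur in the new configuration.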
+ +%% in OTP 25 one can use lists:enumerate +enumerate(List) -> + enumerate(1, List). + +enumerate(Start, List) -> + lists:zip(lists:seq(Start, Start+length(List)-1), List). + +%% =================================================================== +%% eunit tests +%% =================================================================== + +-ifdef(TEST). + +-include_lib("eunit/include/eunit.hrl"). + +simple_cluster_t1_test() -> + RingSize = 32, + TargetN = 4, + NodeList = [n1, n2, n3, n4, n5, n6], + R0 = riak_core_ring:fresh(RingSize, n1), + R1 = + lists:foldl( + fun(N, AccR) -> riak_core_ring:add_member(n1, AccR, N) end, + R0, + NodeList -- [n1]), + Props = [{target_n_val, TargetN}], + RClaim = + claim(R1, Props), + ?assert(true, riak_core_membership_claim:meets_target_n(RClaim, TargetN)). + + +location_t1_test_() -> + JoiningNodes = + [{n2, loc1}, + {n3, loc2}, {n4, loc2}, + {n5, loc3}, {n6, loc3}, + {n7, loc4}, {n8, loc4}, + {n9, loc5}, {n10, loc5} + ], + {"[2, 2, 2, 2, 2] nval 4", + {inparallel, + [location_claim_tester(n1, loc1, JoiningNodes, 64, 4), + location_claim_tester(n1, loc1, JoiningNodes, 128, 4), + location_claim_tester(n1, loc1, JoiningNodes, 256, 4) + %% Don't test large rings in automated testing + %% location_claim_tester(n1, loc1, JoiningNodes, 512, 4), + %% location_claim_tester(n1, loc1, JoiningNodes, 1024, 4) + %% location_claim_tester(n1, loc1, JoiningNodes, 2048, 4) + ]}}. + +location_t2_test_() -> + JoiningNodes = + [{n2, loc1}, + {n3, loc2}, {n4, loc2}, + {n5, loc3}, {n6, loc3}, + {n7, loc4}, {n8, loc4} + ], + {"[2, 2, 2, 2] nval 4", + {inparallel, + [location_claim_tester(n1, loc1, JoiningNodes, 64, 4), + location_claim_tester(n1, loc1, JoiningNodes, 128, 4), + location_claim_tester(n1, loc1, JoiningNodes, 256, 4), + location_claim_tester(n1, loc1, JoiningNodes, 512, 4), + location_claim_tester(n1, loc1, JoiningNodes, 1024, 4), + location_claim_tester(n1, loc1, JoiningNodes, 2048, 4) + ]}}. + +location_t8_test_() -> + JoiningNodes = + [{l1n2, loc1}, {l1n3, loc1}, {l1n4, loc1}, + {l2n1, loc2}, {l2n2, loc2}, {l2n3, loc2}, + {l3n1, loc3}, {l3n2, loc3}, {l3n3, loc3}, + {l4n1, loc4}, {l4n2, loc4}, {l4n3, loc4}], + {"[4, 3, 3, 3] nval 4", + {inparallel, + [location_claim_tester(l1n1, loc1, JoiningNodes, 64, 3), + location_claim_tester(l1n1, loc1, JoiningNodes, 256, 3) + %% Don't test large rings in automated testing + %% location_claim_tester(n1, loc1, JoiningNodes, 512, 4), + %% location_claim_tester(n1, loc1, JoiningNodes, 1024, 4), + %% location_claim_tester(n1, loc1, JoiningNodes, 2048, 4) + ]}}. + +location_claim_tester(N1, N1Loc, NodeLocList, RingSize, TargetN) -> + {"Ringsize "++integer_to_list(RingSize), + {timeout, 120, + fun() -> + io:format( + "Testing NodeList ~w with RingSize ~w~n", + [[{N1, N1Loc}|NodeLocList], RingSize] + ), + R1 = + riak_core_ring:set_node_location( + N1, + N1Loc, + riak_core_ring:fresh(RingSize, N1)), + + RClaim = add_nodes_to_ring(R1, N1, NodeLocList, [{target_n_val, TargetN}]), + {RingSize, Mappings} = riak_core_ring:chash(RClaim), + + check_for_failures(Mappings, TargetN, RClaim) + end}}. + +add_nodes_to_ring(Ring, Claimant, NodeLocList, Params) -> + NewRing = lists:foldl( + fun({N, L}, AccR) -> + AccR0 = riak_core_ring:add_member(Claimant, AccR, N), + riak_core_ring:set_node_location(N, L, AccR0) + end, + Ring, + NodeLocList), + claim(NewRing, Params). 
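+
+%% The helper below treats the location map as circular: the first TargetN
+%% entries are appended at the end, so preflists that wrap around the ring
+%% are also checked against the location spacing target.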
+
+
+check_for_failures(Mappings, TargetN, RClaim) ->
+    Failures = compute_failures(Mappings, TargetN, RClaim),
+    lists:foreach(fun(F) -> io:format("Failure ~p~n", [F]) end, Failures),
+    ?assert(length(Failures) == 0).
+
+compute_failures(Mappings, TargetN, RClaim) ->
+    NLs = riak_core_ring:get_nodes_locations(RClaim),
+    LocationMap =
+        lists:map(
+            fun({Idx, N}) ->
+                {Idx, riak_core_location:get_node_location(N, NLs)}
+            end,
+            Mappings),
+    Prefix = lists:sublist(LocationMap, TargetN),
+    CheckableMap = LocationMap ++ Prefix,
+    {_, Failures} =
+        lists:foldl(
+            fun({Idx, L}, {LastNminus1, Fails}) ->
+                case lists:member(L, LastNminus1) of
+                    false ->
+                        {[L|lists:sublist(LastNminus1, TargetN - 2)], Fails};
+                    true ->
+                        {[L|lists:sublist(LastNminus1, TargetN - 2)],
+                            [{Idx, L, LastNminus1}|Fails]}
+                end
+            end,
+            {[], []},
+            CheckableMap
+        ),
+    Failures.
+
+
+
+location_multistage_t1_test_() ->
+    %% This is a tricky corner case where we would fail to meet TargetN for
+    %% locations if joining all 9 nodes in one claim (as old sequential_claim
+    %% will not succeed). However, if we join 8 nodes, then add the 9th,
+    %% TargetN is always achieved.
+    JoiningNodes =
+        [{l1n2, loc1},
+            {l2n3, loc2}, {l2n4, loc2},
+            {l3n5, loc3}, {l3n6, loc3},
+            {l4n7, loc4}, {l4n8, loc4}
+        ],
+    {inparallel,
+        [
+            location_multistage_claim_tester(64, JoiningNodes, 4, l5n9, loc5, 4),
+            location_multistage_claim_tester(128, JoiningNodes, 4, l5n9, loc5, 4),
+            location_multistage_claim_tester(256, JoiningNodes, 4, l5n9, loc5, 4),
+            location_multistage_claim_tester(512, JoiningNodes, 4, l5n9, loc5, 4)
+            %% location_multistage_claim_tester(1024, JoiningNodes, 4, l5n9, loc5, 4)
+            %% location_multistage_claim_tester(2048, JoiningNodes, 4, l5n9, loc5, 4)
+        ]}.
+
+
+location_multistage_claim_tester(RingSize, JoiningNodes, TargetN, NewNode, NewLocation, VerifyN) ->
+    {timeout, 240,
+        {"Ringsize " ++ integer_to_list(RingSize),
+        fun() ->
+            SW0 = os:timestamp(),
+            N1 = l1n1,
+            N1Loc = loc1,
+            io:format(
+                "Testing NodeList ~w with RingSize ~w~n",
+                [[{N1, N1Loc}|JoiningNodes], RingSize]
+            ),
+            R1 =
+                riak_core_ring:set_node_location(
+                    N1,
+                    N1Loc,
+                    riak_core_ring:fresh(RingSize, N1)),
+
+            Params = [{target_n_val, TargetN}],
+            SW1 = os:timestamp(),
+            RClaimInit = add_nodes_to_ring(R1, N1, JoiningNodes, Params),
+
+            SW2 = os:timestamp(),
+            io:format("Reclaiming without committing~n"),
+
+            RingExtendA =
+                riak_core_ring:set_node_location(
+                    NewNode,
+                    NewLocation,
+                    riak_core_ring:add_member(N1, RClaimInit, NewNode)),
+            RClaimExtendA = claim(RingExtendA, Params),
+
+            io:format("Commit initial claim~n"),
+            SW3 = os:timestamp(),
+
+            RClaimInitCommit =
+                riak_core_ring:increment_vclock(
+                    node(),
+                    riak_core_ring:clear_location_changed(RClaimInit)),
+
+            io:format("Reclaiming following commit~n"),
+            SW4 = os:timestamp(),
+
+            RingExtendB =
+                riak_core_ring:set_node_location(
+                    NewNode,
+                    NewLocation,
+                    riak_core_ring:add_member(N1, RClaimInitCommit, NewNode)),
+            RClaimExtendB = claim(RingExtendB, Params),
+
+            {_RingSizeInit, MappingsInit} = riak_core_ring:chash(RClaimInit),
+            {RingSizeA, MappingsA} = riak_core_ring:chash(RClaimExtendA),
+            {RingSizeB, MappingsB} = riak_core_ring:chash(RClaimExtendB),
+
+            SW5 = os:timestamp(),
+
+            ?assert(RingSizeA == RingSizeB),
+            ?assert(MappingsA == MappingsB),
+
+            io:format("Testing initial Mappings:~n~n~p~n", [MappingsInit]),
+            check_for_failures(MappingsInit, VerifyN, RClaimInit),
+            io:format("Testing secondary Mappings:~n~n~p~n", [MappingsB]),
+            check_for_failures(MappingsB, VerifyN, RClaimExtendB),
+
+            
SW6 = os:timestamp(), + io:format( + "Test for RingSize ~w had timings:" + "Setup ~w First Claim ~w Next Claim ~w Commit ~w Other Claims ~w Verify ~w~n", + [RingSize, + timer:now_diff(SW1, SW0) div 1000, + timer:now_diff(SW2, SW1) div 1000, + timer:now_diff(SW3, SW2) div 1000, + timer:now_diff(SW4, SW3) div 1000, + timer:now_diff(SW5, SW4) div 1000, + timer:now_diff(SW6, SW5) div 1000] + ) + end}}. + +location_typical_expansion_test_() -> + {"Typical expansion", + {inparallel, + [location_typical_expansion_tester(64), + location_typical_expansion_tester(128), + location_typical_expansion_tester(256) + %% location_typical_expansion_tester(512) + ]}}. + +location_typical_expansion_tester(RingSize) -> + {timeout, 120, + {"Ringsize "++integer_to_list(RingSize), + fun() -> + N1 = l1n1, + N1Loc = loc1, + TargetN = 4, + InitJoiningNodes = + [{l1n2, loc1}, + {l2n3, loc2}, {l2n4, loc2}, + {l3n5, loc3}, {l3n6, loc3}, + {l4n7, loc4}, {l4n8, loc4}], + + io:format( + "Testing NodeList ~w with RingSize ~w~n", + [[{N1, N1Loc}|InitJoiningNodes], RingSize] + ), + Params = [{target_n_val, TargetN}], + R1 = + riak_core_ring:set_node_location( + N1, + N1Loc, + riak_core_ring:fresh(RingSize, N1)), + + RClaimInit = add_nodes_to_ring(R1, N1, InitJoiningNodes, Params), + {RingSize, MappingsInit} = riak_core_ring:chash(RClaimInit), + + check_for_failures(MappingsInit, TargetN, RClaimInit), + + Stage1Ring = commit_change(RClaimInit), + + RClaimStage2 = add_node(Stage1Ring, N1, l5n9, loc5, Params), + {RingSize, Mappings2} = riak_core_ring:chash(RClaimStage2), + check_for_failures(Mappings2, TargetN, RClaimStage2), + Stage2Ring = commit_change(RClaimStage2), + + RClaimStage3 = add_node(Stage2Ring, N1, l5n10, loc5, Params), + {RingSize, Mappings3} = riak_core_ring:chash(RClaimStage3), + check_for_failures(Mappings3, TargetN, RClaimStage3), + Stage3Ring = commit_change(RClaimStage3), + + RClaimStage4 = add_node(Stage3Ring, N1, l6n11, loc6, Params), + {RingSize, Mappings4} = riak_core_ring:chash(RClaimStage4), + check_for_failures(Mappings4, TargetN, RClaimStage4), + Stage4Ring = commit_change(RClaimStage4), + + RClaimStage5 = add_node(Stage4Ring, N1, l6n12, loc6, Params), + {RingSize, Mappings5} = riak_core_ring:chash(RClaimStage5), + check_for_failures(Mappings5, TargetN, RClaimStage5), + Stage5Ring = commit_change(RClaimStage5), + + RClaimStage6 = add_node(Stage5Ring, N1, l1n13, loc1, Params), + {RingSize, Mappings6} = riak_core_ring:chash(RClaimStage6), + check_for_failures(Mappings6, TargetN, RClaimStage6), + Stage6Ring = commit_change(RClaimStage6), + + RClaimStage7 = add_node(Stage6Ring, N1, l2n14, loc2, Params), + {RingSize, Mappings7} = riak_core_ring:chash(RClaimStage7), + check_for_failures(Mappings7, TargetN, RClaimStage7), + Stage7Ring = commit_change(RClaimStage7), + + RClaimStage8 = add_node(Stage7Ring, N1, l3n15, loc3, Params), + {RingSize, Mappings8} = riak_core_ring:chash(RClaimStage8), + check_for_failures(Mappings8, TargetN, RClaimStage8), + Stage8Ring = commit_change(RClaimStage8), + + RClaimStage9 = add_node(Stage8Ring, N1, l4n16, loc4, Params), + {RingSize, Mappings9} = riak_core_ring:chash(RClaimStage9), + check_for_failures(Mappings9, TargetN, RClaimStage9), + _Stage9Ring = commit_change(RClaimStage9) + end}}. 
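+
+%% add_node/5 below stages a single joiner, reruns claim, and asserts that
+%% ownership stays balanced: after each claim, the node owning the most
+%% vnodes holds at most one more than the node owning the fewest.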
+ + +add_node(Ring, Claimant, Node, Location, Params) -> + RingC = add_nodes_to_ring(Ring, Claimant, [{Node, Location}], Params), + + OwnersPre = riak_core_ring:all_owners(Ring), + OwnersPost = riak_core_ring:all_owners(RingC), + OwnersZip = lists:zip(OwnersPre, OwnersPost), + Next = + [{Idx, PrevOwner, NewOwner, [], awaiting} || + {{Idx, PrevOwner}, {Idx, NewOwner}} <- OwnersZip, + PrevOwner /= NewOwner], + + NodeCountD = + lists:foldl( + fun({_Idx, N}, D) -> + dict:update_counter(N, 1, D) + end, + dict:new(), + OwnersPost + ), + NodeCounts = + lists:map(fun({_N, C}) -> C end, dict:to_list(NodeCountD)), + io:format( + % user, + "NodeCounts~w~n", + [dict:to_list(NodeCountD)]), + io:format( + % user, + "Adding node ~w in location ~w - ~w transfers ~w max ~w min vnodes~n", + [Node, Location, + length(Next), lists:max(NodeCounts), lists:min(NodeCounts)]), + ?assert( + (lists:min(NodeCounts) == (lists:max(NodeCounts) - 1)) or + (lists:min(NodeCounts) == lists:max(NodeCounts)) + ), + % ?assert(length(Next) =< ExpectedTransferMax), + RingC. + +commit_change(Ring) -> + lists:foldl( + fun(JN, R) -> + riak_core_ring:set_member(node(), R, JN, valid, same_vclock) + end, + Ring, + riak_core_ring:members(Ring, [joining]) + ). + +leave_node_test_() -> + {inorder, + [leave_node_from_location_test(l4n8), + leave_node_from_location_test(l4n7)]}. + +leave_node_from_location_test(Leaving) -> + {timeout, 120, + fun() -> + N1 = l1n1, + N1Loc = loc1, + TargetN = 4, + RingSize = 64, + InitJoiningNodes = + [{l1n2, loc1}, + {l2n3, loc2}, {l2n4, loc2}, + {l3n5, loc3}, {l3n6, loc3}, + {l4n7, loc4}, {l4n8, loc4}, + {l5n9, loc5}, {l5n10, loc5}], + + Params = [{target_n_val, TargetN}], + LeavingLoc = proplists:get_value(Leaving, InitJoiningNodes), + R1 = + riak_core_ring:set_node_location( + N1, + N1Loc, + riak_core_ring:fresh(RingSize, N1)), + + RClaimInit = add_nodes_to_ring(R1, N1, InitJoiningNodes, Params), + {RingSize, MappingsInit} = riak_core_ring:chash(RClaimInit), + + check_for_failures(MappingsInit, TargetN, RClaimInit), + + Stage1Ring = commit_change(RClaimInit), + + %% One node leaves, check it is actually not an owner any more + RLeave = riak_core_ring:leave_member(N1, Stage1Ring, Leaving), + RClaimStage2 = claim(RLeave, Params), + + {RingSize, Mappings2} = riak_core_ring:chash(RClaimStage2), + Nodes2 = lists:usort([ N || {_, N} <- Mappings2 ]), + check_for_failures(Mappings2, TargetN, RClaimStage2), + ?assert(not lists:member(Leaving, Nodes2)), + + %% We should not change the ring if we rename a node at a certain location: + RAdd1 = + riak_core_ring:set_node_location(l4ne, loc4, + riak_core_ring:add_member(N1, Stage1Ring, l4ne)), + RLeave1 = + riak_core_ring:leave_member(N1, RAdd1, Leaving), + RClaimStage3 = claim(RLeave1, Params), + + {RingSize, Mappings3} = riak_core_ring:chash(RClaimStage3), + check_for_failures(Mappings3, TargetN, RClaimStage3), + Diffs = [ {Idx, N} || {Idx, N} <- Mappings3, + case proplists:get_value(Idx, MappingsInit) of + Leaving -> + not (N == l4ne orelse + %% balanced by another node at that location + lists:member(N, [Node || {Node, Loc} <- InitJoiningNodes, Loc == LeavingLoc])); + OldN -> + OldN /= N + end + ], + ?assertEqual(Diffs, []) + end}. 
+
+six_node_location_test_() ->
+    {timeout, 120,
+        fun() ->
+            N1 = l1n1,
+            N1Loc = loc1,
+            TargetN = 4,
+            RingSize = 32,
+            InitJoiningNodes =
+                [{l1n2, loc1},
+                    {l2n1, loc2}, {l2n2, loc2},
+                    {l3n1, loc3}, {l3n2, loc3}],
+
+            Params = [{target_n_val, TargetN}],
+            R1 =
+                riak_core_ring:set_node_location(
+                    N1,
+                    N1Loc,
+                    riak_core_ring:fresh(RingSize, N1)),
+
+            RClaimInit = add_nodes_to_ring(R1, N1, InitJoiningNodes, Params),
+            PrefLists = riak_core_ring:all_preflists(RClaimInit, TargetN),
+
+            %% Sometimes one can be lucky and get a node target N of 4 with
+            %% 3 out of 4 different locations. This is not the same as a
+            %% target N of 3 for locations.
+
+            LocPrefs =
+                lists:map(fun(PL) ->
+                            [ proplists:get_value(Node, [{l1n1, loc1}|InitJoiningNodes]) || {_, Node} <- PL ]
+                        end, PrefLists),
+
+            ?assert(lists:all(fun(PL) ->
+                                length(PL) == 4 andalso length(lists:usort(PL)) == 3
+                            end, LocPrefs))
+        end}.
+
+
+leave_location_test_() ->
+    {timeout, 120,
+        fun() ->
+            N1 = l1n1,
+            N1Loc = loc1,
+            TargetN = 3,
+            RingSize = 64,
+            InitJoiningNodes =
+                [{l1n2, loc1},
+                    {l2n3, loc2}, {l2n4, loc2},
+                    {l3n1, loc3}, {l3n2, loc3},
+                    {l4n1, loc4}, {l4n2, loc4},
+                    {l5n1, loc5}],
+
+            Params = [{target_n_val, TargetN}],
+            LeaveLoc = loc3,
+            LeaveNodes = [ N || {N, Loc} <- InitJoiningNodes, Loc == LeaveLoc ],
+            R1 =
+                riak_core_ring:set_node_location(
+                    N1,
+                    N1Loc,
+                    riak_core_ring:fresh(RingSize, N1)),
+
+            RClaimInit = add_nodes_to_ring(R1, N1, InitJoiningNodes, Params),
+            {RingSize, MappingsInit} = riak_core_ring:chash(RClaimInit),
+
+            check_for_failures(MappingsInit, TargetN, RClaimInit),
+
+            Stage1Ring = commit_change(RClaimInit),
+
+            %% One location leaves; check its nodes are no longer owners
+            RLeave =
+                lists:foldl(fun(N, R) ->
+                                riak_core_ring:leave_member(N1, R, N)
+                            end, Stage1Ring, LeaveNodes),
+            RClaimStage2 = claim(RLeave, Params),
+
+            {RingSize, Mappings2} = riak_core_ring:chash(RClaimStage2),
+            Nodes2 = lists:usort([ N || {_, N} <- Mappings2 ]),
+            check_for_failures(Mappings2, TargetN, RClaimStage2),
+            ?assertEqual(Nodes2 -- LeaveNodes, Nodes2),
+
+            %% We should not move nodes in locations that are not involved if
+            %% we "rename" a location by leaving a node in one location
+            %% and adding one to the next [2, 2, 2, 2, 1] -> [2, 2, 2, 1, 2]:
+            RAdd1 =
+                riak_core_ring:set_node_location(l5n2, loc5,
+                    riak_core_ring:add_member(N1, Stage1Ring, l5n2)),
+            RLeave3 =
+                riak_core_ring:leave_member(N1, RAdd1, l4n2),
+            RClaimStage3 = claim(RLeave3, Params),
+            InvolvedNodes = [ N || {N, Loc} <- InitJoiningNodes ++ [{l5n2, loc5}],
+                                   Loc == loc4 orelse Loc == loc5 ],
+
+            {RingSize, Mappings3} = riak_core_ring:chash(RClaimStage3),
+            check_for_failures(Mappings3, TargetN, RClaimStage3),
+            Diffs = [ {Idx, N}
+                      || {Idx, N} <- Mappings3,
+                         not case lists:member(proplists:get_value(Idx, MappingsInit), InvolvedNodes) of
+                                 true ->
+                                     lists:member(N, InvolvedNodes);
+                                 false ->
+                                     N == proplists:get_value(Idx, MappingsInit)
+                             end
+                    ],
+            ?assertEqual(Diffs, [])
+        end}.
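+
+%% Relabelling a node's location without changing membership should not
+%% move any vnodes; the test below asserts the chash is unchanged after
+%% such a move, and that an add-then-leave "rename" of a node only
+%% transfers the leaver's partitions.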
+ +move_location_test_() -> + {timeout, 120, + fun() -> + N1 = l1n1, + N1Loc = loc1, + TargetN = 2, + RingSize = 16, + InitJoiningNodes = + [{l2n1, loc2}, + {l3n1, loc3}, {l3n2, loc3}], + + Params = [{target_n_val, TargetN}], + R1 = + riak_core_ring:set_node_location( + N1, + N1Loc, + riak_core_ring:fresh(RingSize, N1)), + + RClaimInit = add_nodes_to_ring(R1, N1, InitJoiningNodes, Params), + {RingSize, MappingsInit} = riak_core_ring:chash(RClaimInit), + + check_for_failures(MappingsInit, TargetN, RClaimInit), + + Stage1Ring = commit_change(RClaimInit), + + %% [1, 1, 2] -> [2, 0, 2] + RMove = + riak_core_ring:set_node_location(l2n1, loc1, Stage1Ring), + RClaimStage2 = claim(RMove, Params), + + {RingSize, Mappings3} = riak_core_ring:chash(RClaimStage2), + check_for_failures(Mappings3, TargetN, RClaimStage2), + + ?assertEqual(riak_core_ring:chash(RClaimStage2), riak_core_ring:chash(Stage1Ring)), + RAdd = + riak_core_ring:set_node_location(l1n2, loc1, + riak_core_ring:add_member(N1, Stage1Ring, l1n2)), + RLeave = + riak_core_ring:leave_member(N1, RAdd, l2n1), + RClaimStage3 = claim(RLeave, Params), + {RingSize, Mappings4} = riak_core_ring:chash(RClaimStage3), + + Diffs = [ {Idx, N} || {Idx, N} <- Mappings4, + case proplists:get_value(Idx, MappingsInit) of + l2n1 -> + N /= l1n2; + OldN -> + OldN /= N + end + ], + ?assertEqual(Diffs, []) + + end}. + + +-endif. diff --git a/src/riak_core_claimant.erl b/src/riak_core_claimant.erl index 9b862e010..f82cc324e 100644 --- a/src/riak_core_claimant.erl +++ b/src/riak_core_claimant.erl @@ -453,8 +453,7 @@ maybe_commit_staged(Ring, NextRing, #state{next_ring=PlannedRing}) -> {_, _, false} -> {ignore, plan_changed}; _ -> - NewRing0 = riak_core_ring:clear_location_changed(NextRing), - NewRing1 = riak_core_ring:increment_vclock(Claimant, NewRing0), + NewRing1 = riak_core_ring:increment_vclock(Claimant, NextRing), {new_ring, NewRing1} end. @@ -672,9 +671,7 @@ same_plan(RingA, RingB) -> (riak_core_ring:pending_changes(RingA) == riak_core_ring:pending_changes(RingB)). schedule_tick() -> - Tick = app_helper:get_env(riak_core, - claimant_tick, - 10000), + Tick = app_helper:get_env(riak_core, claimant_tick, 10000), erlang:send_after(Tick, ?MODULE, tick). tick(PreFetchRing, RingID, State=#state{last_ring_id=LastID}) -> @@ -1355,7 +1352,7 @@ update_ring(CNode, CState, Replacing, Seed, Log, false) -> %% Rebalance the ring as necessary. If pending changes exist ring %% is not rebalanced - Next3 = rebalance_ring(CNode, CState4), + {Next3, LocationChanged} = rebalance_ring(CState4), Log(debug,{"Pending ownership transfers: ~b~n", [length(riak_core_ring:pending_changes(CState4))]}), @@ -1373,7 +1370,10 @@ update_ring(CNode, CState, Replacing, Seed, Log, false) -> ?ROUT("Updating ring :: next3 : ~p~n", [Next4]), CState5 = riak_core_ring:set_pending_changes(CState4, Next4), CState6 = riak_core_ring:increment_ring_version(CNode, CState5), - {true, CState6}; + CState7 = + riak_core_ring:force_location_changed( + CState6, LocationChanged), + {true, CState7}; false -> {false, CState} end; @@ -1451,11 +1451,14 @@ reassign_indices(CState, Replacing, Seed, Log) -> {RingChanged or NextChanged, CState3}. %% @private -rebalance_ring(CNode, CState) -> +-spec rebalance_ring( + riak_core_ring:riak_core_ring()) -> + {[{non_neg_integer(), node(), node(), list(), awaiting}], boolean()}. +rebalance_ring(CState) -> Next = riak_core_ring:pending_changes(CState), - rebalance_ring(CNode, Next, CState). + rebalance_ring(Next, CState). 
-rebalance_ring(_CNode, [], CState) -> +rebalance_ring([], CState) -> CState2 = riak_core_membership_claim:claim(CState), Owners1 = riak_core_ring:all_owners(CState), Owners2 = riak_core_ring:all_owners(CState2), @@ -1463,9 +1466,9 @@ rebalance_ring(_CNode, [], CState) -> Next = [{Idx, PrevOwner, NewOwner, [], awaiting} || {{Idx, PrevOwner}, {Idx, NewOwner}} <- Owners3, PrevOwner /= NewOwner], - Next; -rebalance_ring(_CNode, Next, _CState) -> - Next. + {Next, riak_core_ring:has_location_changed(CState2)}; +rebalance_ring(Next, CState) -> + {Next, riak_core_ring:has_location_changed(CState)}. %% @private handle_down_nodes(CState, Next) -> diff --git a/src/riak_core_membership_claim.erl b/src/riak_core_membership_claim.erl index 59a60b13f..b35f5f48e 100644 --- a/src/riak_core_membership_claim.erl +++ b/src/riak_core_membership_claim.erl @@ -106,7 +106,7 @@ claim(Ring) -> choose_claim_v3 -> {riak_core_membership_claim, choose_claim_v3}; choose_claim_v4 -> - {riak_core_claim_location, choose_claim_v4}; + {riak_core_claim_swapping, choose_claim_v4}; {CMod, CFun} -> {CMod, CFun} end, @@ -119,22 +119,17 @@ claim(Ring) -> riak_core_ring:riak_core_ring(), {module(), atom()}, choose_function()) -> riak_core_ring:riak_core_ring(). + claim(Ring, {WMod, WFun}=Want, Choose) -> Members = riak_core_ring:claiming_members(Ring), - Owners = - lists:usort( - lists:map( - fun({_Idx, N}) -> N end, - riak_core_ring:all_owners(Ring))), NoInitialWants = lists:all( fun(N) -> apply(WMod, WFun, [Ring, N]) == no end, Members), - SortedMembers = sort_members_for_choose(Ring, Members, Owners, Choose), case NoInitialWants of true -> case riak_core_ring:has_location_changed(Ring) of true -> - [HeadMember|_Rest] = SortedMembers, + [HeadMember|_Rest] = Members, choose_new_ring( riak_core_ring:clear_location_changed(Ring), HeadMember, @@ -147,8 +142,8 @@ claim(Ring, {WMod, WFun}=Want, Choose) -> fun(Node, Ring0) -> claim_until_balanced(Ring0, Node, Want, Choose) end, - riak_core_ring:clear_location_changed(Ring), - SortedMembers) + Ring, + Members) end. -spec choose_new_ring( @@ -162,23 +157,7 @@ choose_new_ring(Ring, Node, Choose) -> CMod:CFun(Ring, Node, Params) end. -%% @doc -%% The order by which members are passed in to claim may make a difference -%% to the outcome, so prepare to allow for this order to be changeable in -%% different claim versions --spec sort_members_for_choose( - riak_core_ring:riak_core_ring(), - list(node()), - list(node()), - choose_function()) -> list(node()). -sort_members_for_choose(Ring, Members, Owners, Choose) -> - CMod = element(1, Choose), - case erlang:function_exported(CMod, sort_members_for_choose, 3) of - true -> - CMod:sort_members_for_choose(Ring, Members, Owners); - false -> - Members - end. + -spec claim_until_balanced( riak_core_ring:riak_core_ring(), node()) -> @@ -243,7 +222,7 @@ get_target_n() -> %% =================================================================== -%% Claim Function Implementations +%% Claim Function Implementations %% =================================================================== -spec wants_claim_v2( @@ -254,7 +233,7 @@ wants_claim_v2(Ring) -> -spec wants_claim_v2( riak_core_ring:riak_core_ring(), node()) -> - no|{yes, non_neg_integer()}. + no|{yes, non_neg_integer()}. 
wants_claim_v2(Ring, Node) -> Active = riak_core_ring:claiming_members(Ring), Owners = riak_core_ring:all_owners(Ring), @@ -277,13 +256,13 @@ wants_claim_v3(Ring) -> wants_claim_v3(Ring, _Node) -> Wants = wants(Ring), - + %% This case will only hold true during claim_until_balanced %% as only the ownership information is transferred after %% running claim not the metadata. case riak_core_ring:get_meta(claimed, Ring) of {ok, {claim_v3, Wants}} -> - lager:debug("WantsClaim3(~p) no. Current ring claimed for ~p\n", + lager:debug("WantsClaim3(~p) no. Current ring claimed for ~p\n", [_Node, Wants]), no; {ok, {claim_v3, CurWants}} -> @@ -301,7 +280,7 @@ wants_claim_v3(Ring, _Node) -> "unsetting force_reclaim"), {yes, 1}; false -> - %% Otherwise, base wants decision on whether the current + %% Otherwise, base wants decision on whether the current %% wants versus current ownership if the claim does not %% manage to claim all requested nodes then the temporary %% 'claim_v3' metadata will stop the loop @@ -311,7 +290,7 @@ wants_claim_v3(Ring, _Node) -> Diffs = lists:sum([abs(Diff) || {_, Diff} <- Deltas]), case Diffs of 0 -> - lager:debug("WantsClaim3(~p) no. All wants met.\n", + lager:debug("WantsClaim3(~p) no. All wants met.\n", [_Node]), no; _ -> @@ -489,7 +468,7 @@ increase_takes([], N, _Max, Acc) when N < 0 -> [{Node, Delta} || {Node, _Own, Delta} <- lists:usort(Acc)]; increase_takes([{Node, Own, Delta} | Rest], N, Max, Acc) when Delta > 0 -> WouldOwn = Own + Delta, - Additive = + Additive = case (WouldOwn + 1) =< Max of true -> 1; false -> 0 @@ -523,7 +502,7 @@ meets_target_n([], TargetN, Index, First, Last) -> %% start through end guarantees TargetN %% compute violations at wrap around, but don't fail %% because of them: handle during reclaim - Violations = + Violations = lists:filter( fun({Node, L, _}) -> {Node, F} = proplists:lookup(Node, First), @@ -573,7 +552,7 @@ choose_claim_v3(Ring, _ClaimNode, Params) -> R0; ({P, _OldOwn, NewOwn}, R0) -> riak_core_ring:transfer_node(P, NewOwn, R0) - end, Ring, + end, Ring, lists:zip3(Partitions, Owners, NewOwners)), riak_core_ring:update_meta(claimed, {claim_v3, Wants}, NewRing). @@ -587,7 +566,7 @@ choose_claim_v3(Ring, _ClaimNode, Params) -> %% %% Balance is a measure of the number of partitions owned versus the number of partitions %% wanted. Want is supplied to the algorithm by the caller as a list of node/counts. The -%% score for deviation is the RMS of the difference between what the node wanted and what it +%% score for deviation is the RMS of the difference between what the node wanted and what it %% has. Lower is better, 0 if all wants are mets. 
%% %% Diversity measures how often nodes are close to one another in the preference @@ -691,14 +670,19 @@ sequential_claim(Ring0, Node, TargetN) -> MinFetchesPerSeq = ceiling(Shortfall / CompleteSequences), CanSolveViolation = ((CompleteSequences * MaxFetchesPerSeq) >= Shortfall), - Zipped = case (HasTailViolation andalso CanSolveViolation) of - true-> - Partitions = lists:sort([ I || {I, _} <- riak_core_ring:all_owners(Ring) ]), - Nodelist = solve_tail_violations(RingSize, Nodes, Shortfall, MinFetchesPerSeq), - lists:zip(Partitions, lists:flatten(Nodelist)); - false -> - diagonal_stripe(Ring, Nodes) - end, + Zipped = + case (HasTailViolation andalso CanSolveViolation) of + true-> + Partitions = + lists:sort( + [ I || {I, _} <- riak_core_ring:all_owners(Ring) ]), + Nodelist = + solve_tail_violations( + RingSize, Nodes, Shortfall, MinFetchesPerSeq), + lists:zip(Partitions, lists:flatten(Nodelist)); + false -> + diagonal_stripe(Ring, Nodes) + end, lists:foldl(fun({P, N}, Acc) -> riak_core_ring:transfer_node(P, N, Acc) @@ -907,7 +891,7 @@ spaced_by_n(NthA, NthB, TargetN, RingSize) -> %% @private %% -%% @doc Build node info list from Wants and Owners. +%% @doc Build node info list from Wants and Owners. build_nis(Wants, Owners) -> Initial = [{N, orddict:new()} || {N, _W} <- Wants], {_, Ownership} = lists:foldl(fun(N, {I,A}) -> @@ -924,7 +908,7 @@ wants_owns_diff(Wants, Owns) -> false -> {N,W} end || {N, W} <- Wants ]. - + %% Given a ring, work out how many partition each wants to be %% considered balanced wants(Ring) -> @@ -936,11 +920,11 @@ wants(Ring) -> lists:sort(ActiveWants ++ InactiveWants). %% @private -%% Given a number of nodes and ring size, return a list of +%% Given a number of nodes and ring size, return a list of %% desired ownership, S long that add up to Q wants_counts(S, Q) -> Max = roundup(Q / S), - case S * Max - Q of + case S * Max - Q of 0 -> lists:duplicate(S, Max); X -> @@ -1022,7 +1006,7 @@ balance(Wants, NIs) -> Diff * Diff end || {Node, Want} <- Wants]). -%% @private +%% @private %% Make the number of plans requested make_plans(NumPlans, NIs, Q, TN) -> lists:usort([make_plan(NIs, Q, TN) || _ <- lists:seq(1,NumPlans)]). @@ -1055,11 +1039,11 @@ violations(NIs, Q, TN) -> [begin Give = length(V), Take = gt0(Want - length(Idxs)), - {Node, Give, Take, VIdxs} + {Node, Give, Take, VIdxs} end || {Node, Want, Idxs} <- NIs, {Node1, V} <- NodeViolations, Node == Node1]. %% @private -%% Return a list of exchanges to fix overloads +%% Return a list of exchanges to fix overloads overloads(NIs) -> OLIdxs = ordsets:from_list(lists:flatten([Idxs || {_Node, Want, Idxs} <- NIs, length(Idxs) > Want])), @@ -1077,9 +1061,9 @@ overloads(NIs) -> %% @private %% Given a list of Exchanges of the form [{Node, #Give, #Take, ClaimableIdxs}] %% randomly select from exchanges until there are no more nodes that wish to take -%% indices that can. Update the owned indexes in the provided NodeInfos +%% indices that can. 
Update the owned indexes in the provided NodeInfos %% of the form [{Node, Want, OwnedIdxs]} -%% +%% take_idxs(Exchanges, NIs, Q, TN) -> %% work out globally unavailable indexes from nodes that do not wish %% to give any indices- find OIdxs for all exchanges with give=0 @@ -1088,13 +1072,13 @@ take_idxs(Exchanges, NIs, Q, TN) -> [OIdxs || {Node, 0, _Take, _CIdxs} <- Exchanges, {Node1, _Want, OIdxs} <- NIs, Node == Node1])), - %% Remove any exchanges in GUIdxs or that would violate TN for the node - Exchanges1 = [{Node, Give, Take, remove_unclaimable(CIdxs, GUIdxs, Node, NIs, Q, TN)} || + %% Remove any exchanges in GUIdxs or that would violate TN for the node + Exchanges1 = [{Node, Give, Take, remove_unclaimable(CIdxs, GUIdxs, Node, NIs, Q, TN)} || {Node, Give, Take, CIdxs} <- Exchanges], %% Recursively take indices until all takes are satisfied take_idxs0(Exchanges1, NIs, Q, TN). - + take_idxs0(Exchanges, NIs, Q, TN) -> %% Pick a recipient for a claimed index case [{Node, CIdxs} || {Node, _Give, Take, CIdxs} <- Exchanges, Take > 0, CIdxs /= []] of @@ -1107,12 +1091,12 @@ take_idxs0(Exchanges, NIs, Q, TN) -> %% Find the owner of CIdx and remove it from the giving node owned idxs in NIs [ {GNode, GWant, GOIdxs} ] = [ T || {_Node, _GWant, GIdxs}=T <- NIs, ordsets:is_element(CIdx, GIdxs) ], - NIs1 = lists:keyreplace(GNode, 1, NIs, + NIs1 = lists:keyreplace(GNode, 1, NIs, {GNode, GWant, ordsets:del_element(CIdx, GOIdxs)}), %% Add CIdx to owned indices in NIs {TNode, TWant, TOIdxs} = lists:keyfind(TNode, 1, NIs1), - NIs2 = lists:keyreplace(TNode, 1, NIs1, + NIs2 = lists:keyreplace(TNode, 1, NIs1, {TNode, TWant, ordsets:add_element(CIdx, TOIdxs)}), %% If the Give count is zero in the recipients it has given up all it is prepared @@ -1121,7 +1105,7 @@ take_idxs0(Exchanges, NIs, Q, TN) -> %% Also remove the indices within TN of CIdx from the TakeNode {GNode, GGive, _GTake, _GCIdxs} = lists:keyfind(GNode, 1, Exchanges), - %% Update the recipients list, removing any nodes that have taken the + %% Update the recipients list, removing any nodes that have taken the %% number they requested from the recipients list, and removing the %% indices owned by any nodes that have given all they wanted. @@ -1132,7 +1116,7 @@ take_idxs0(Exchanges, NIs, Q, TN) -> GOIdxs end, %% Indexes unclaiamble by the take node - TUIdxs = ordsets:union(UIdxs, + TUIdxs = ordsets:union(UIdxs, ordsets:from_list(expand_idx(CIdx, Q, TN))), Exchanges2 = lists:foldl( @@ -1163,19 +1147,19 @@ remove_unclaimable(CIdxs, GUIdxs, Node, NIs, Q, TN) -> Off <- lists:seq(1, TN - 1) ])), ordsets:subtract(ordsets:subtract(CIdxs, NUIdxs), GUIdxs). -%% @private +%% @private %% Return the value if greater than zero, otherwise zero gt0(I) when I > 0 -> I; gt0(_) -> 0. -%% @private +%% @private %% Pick a random element from the list random_el(L) -> lists:nth(urand(length(L)), L). -%% @private +%% @private %% Return a random number between Low, High inclusive %% Abstract away choice of random number generation urand(High) -> @@ -1216,13 +1200,15 @@ circular_distance(I1, I2, Q) -> -spec get_nodes_by_location([node()|undefined], riak_core_ring:riak_core_ring()) -> [node()|undefined]. get_nodes_by_location(Nodes, Ring) -> - NodesLocations = riak_core_ring:get_nodes_locations(Ring), - case riak_core_location:has_location_set_in_cluster(NodesLocations) of - false -> - Nodes; - true -> - riak_core_location:stripe_nodes_by_location(Nodes, NodesLocations) - end. 
+ NodesLocations = riak_core_ring:get_nodes_locations(Ring), + NodeList = + case riak_core_location:has_location_set_in_cluster(NodesLocations) of + false -> + Nodes; + true -> + riak_core_location:stripe_nodes_by_location(Nodes, NodesLocations) + end, + NodeList. %% =================================================================== %% Unit tests @@ -1262,6 +1248,144 @@ has_violations(Diag) -> Overhang = RS rem NC, (Overhang > 0 andalso Overhang < 4). %% hardcoded target n of 4 +%% Test that if there is no solution without violations, we still present +%% a balanced "solution" in finite time +impossible_config_test_() -> + {timeout, 120, + fun() -> + N1 = l1n1, + N1Loc = loc1, + TargetN = 2, + RingSize = 16, + InitJoiningNodes = + [{l2n1, loc2}, + {l2n2, loc2}, {l2n3, loc2}, + {l2n4, loc2}, {l2n5, loc2}], + + Params = [{target_n_val, TargetN}], + R1 = + riak_core_ring:set_node_location( + N1, + N1Loc, + riak_core_ring:fresh(RingSize, N1)), + + RClaimInit = lists:foldl( + fun({N, L}, AccR) -> + AccR0 = riak_core_ring:add_member(N1, AccR, N), + riak_core_ring:set_node_location(N, L, AccR0) + end, + R1, + InitJoiningNodes), + %% Use the entry for ?MODULE here: + NewRing = + claim(RClaimInit, + {?MODULE, default_wants_claim}, + {riak_core_claim_swapping, choose_claim_v4, Params}), + + %% There are location violations, but no node violations! + ?assertEqual(find_violations(NewRing, TargetN), []) + end}. + +location_seqclaim_t1_test() -> + JoiningNodes = + [{n2, loc1}, + {n3, loc2}, {n4, loc2}, + {n5, loc3}, {n6, loc3} + ], + location_claim_tester(n1, loc1, JoiningNodes, 64). + +location_claim_tester(N1, N1Loc, NodeLocList, RingSize) -> + location_claim_tester( + N1, N1Loc, NodeLocList, RingSize, choose_claim_v2, 4, 3), + location_claim_tester( + N1, N1Loc, NodeLocList, RingSize, sequential_claim, 4, 3). + +location_claim_tester( + N1, N1Loc, NodeLocList, RingSize, ClaimFun, TargetN, CheckerN) -> + io:format( + "Testing NodeList ~w with RingSize ~w~n", + [[{N1, N1Loc}|NodeLocList], RingSize] + ), + R1 = + riak_core_ring:set_node_location( + N1, + N1Loc, + riak_core_ring:fresh(RingSize, N1)), + + RAll = + lists:foldl( + fun({N, L}, AccR) -> + AccR0 = riak_core_ring:add_member(N1, AccR, N), + riak_core_ring:set_node_location(N, L, AccR0) + end, + R1, + NodeLocList + ), + Params = + case ClaimFun of + sequential_claim -> + TargetN; + choose_claim_v2 -> + [{target_n_val, TargetN}] + end, + RClaim = + claim( + RAll, + {riak_core_membership_claim, default_wants_claim}, + {riak_core_membership_claim, ClaimFun, Params}), + {RingSize, Mappings} = riak_core_ring:chash(RClaim), + + ?assertEqual( + [], + riak_core_location:check_ring(RClaim, TargetN, CheckerN) + ), + + check_for_failures(Mappings, TargetN, CheckerN, RClaim), + + RClaim2 = + claim( + riak_core_ring:set_node_location( + N1, + N1Loc, + RClaim), + {riak_core_membership_claim, default_wants_claim}, + {riak_core_membership_claim, ClaimFun, Params}), + {RingSize, Mappings2} = riak_core_ring:chash(RClaim2), + + check_for_failures(Mappings2, TargetN, CheckerN, RClaim2). 
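+
+%% Unlike the eunit helper of the same name in riak_core_claim_swapping,
+%% the checker below seeds its fold with the final TargetN - 1 locations,
+%% so the wrap-around window is validated from the first entry, and it
+%% only flags a window containing fewer than CheckerN distinct locations.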
+ + +check_for_failures(Mappings, TargetN, CheckerN, RClaim) -> + NLs = riak_core_ring:get_nodes_locations(RClaim), + LocationMap = + lists:map( + fun({Idx, N}) -> + {Idx, riak_core_location:get_node_location(N, NLs)} + end, + Mappings), + Prefix = lists:sublist(LocationMap, TargetN - 1), + LastPart = + lists:sublist( + LocationMap, length(LocationMap) - (TargetN - 1), TargetN - 1), + CheckableMap = LocationMap ++ Prefix, + {_, Failures} = + lists:foldl( + fun({Idx, L}, {LastNminus1, Fails}) -> + case length(lists:usort([L|LastNminus1])) of + UniqL when UniqL < CheckerN -> + {[L|lists:sublist(LastNminus1, TargetN - 2)], + [{Idx, L, LastNminus1}|Fails]}; + _ -> + {[L|lists:sublist(LastNminus1, TargetN - 2)], Fails} + end + end, + {LastPart, []}, + CheckableMap + ), + lists:foreach(fun(F) -> io:format("Failure ~p~n", [F]) end, Failures), + ?assert(length(Failures) == 0). + + -ifdef(EQC). @@ -1328,7 +1452,7 @@ choose_claim_v4(Ring, Node) -> choose_claim_v4(Ring, Node, Params). choose_claim_v4(Ring, Node, Params) -> - riak_core_claim_location:choose_claim_v4(Ring, Node, Params). + riak_core_claim_swapping:choose_claim_v4(Ring, Node, Params). %% NOTE: this is a less than adequate test that has been re-instated %% so that we don't leave the code worse than we found it. Work that @@ -1338,7 +1462,7 @@ choose_claim_v4(Ring, Node, Params) -> %% test re-instated to pass. prop_claim_ensures_unique_nodes_old(ChooseFun) -> %% NOTE: We know that this doesn't work for the case of {_, 3}. - ?FORALL({PartsPow, NodeCount}, {choose(4,9), choose(4,15)}, %{choose(4, 9), choose(4, 15)}, + ?FORALL({PartsPow, NodeCount}, {choose(4,9), choose(4,15)}, begin Nval = 3, TNval = Nval + 1, @@ -1386,7 +1510,7 @@ prop_claim_ensures_unique_nodes(ChooseFun) -> begin Nval = 3, TNval = Nval + 1, - _Params = [{target_n_val, TNval}], + Params = [{target_n_val, TNval}], Partitions = ?POW_2(PartsPow), [Node0 | RestNodes] = test_nodes(NodeCount), @@ -1396,7 +1520,7 @@ prop_claim_ensures_unique_nodes(ChooseFun) -> riak_core_ring:add_member(Node0, Racc, Node) end, R0, RestNodes), - Rfinal = claim(RAdded, {?MODULE, wants_claim_v2}, {?MODULE, ChooseFun}), + Rfinal = claim(RAdded, {?MODULE, wants_claim_v2}, {?MODULE, ChooseFun, Params}), Preflists = riak_core_ring:all_preflists(Rfinal, Nval), ImperfectPLs = orddict:to_list( @@ -1434,12 +1558,12 @@ prop_claim_ensures_unique_nodes_adding_groups(ChooseFun) -> %% NOTE2: uses undocumented "double_shrink", is expensive, but should get %% around those case where we shrink to a non-minimal case because %% some intermediate combinations of ring_size/node have no violations - ?FORALL({PartsPow, BaseNodes, AddedNodes}, + ?FORALL({PartsPow, BaseNodes, AddedNodes}, eqc_gen:double_shrink({choose(4, 9), choose(2, 10), choose(2, 5)}), begin Nval = 3, TNval = Nval + 1, - _Params = [{target_n_val, TNval}], + Params = [{target_n_val, TNval}], Partitions = ?POW_2(PartsPow), [Node0 | RestNodes] = test_nodes(BaseNodes), @@ -1453,12 +1577,12 @@ prop_claim_ensures_unique_nodes_adding_groups(ChooseFun) -> riak_core_ring:add_member(Node0, Racc, Node) end, R0, RestNodes), - Rinterim = claim(RBase, {?MODULE, wants_claim_v2}, {?MODULE, ChooseFun}), + Rinterim = claim(RBase, {?MODULE, wants_claim_v2}, {?MODULE, ChooseFun, Params}), RAdded = lists:foldl(fun(Node, Racc) -> riak_core_ring:add_member(Node0, Racc, Node) end, Rinterim, AddNodes), - Rfinal = claim(RAdded, {?MODULE, wants_claim_v2}, {?MODULE, ChooseFun}), + Rfinal = claim(RAdded, {?MODULE, wants_claim_v2}, {?MODULE, ChooseFun, Params}), Preflists = 
riak_core_ring:all_preflists(Rfinal, Nval), ImperfectPLs = orddict:to_list( @@ -1509,9 +1633,7 @@ prop_claim_ensures_unique_nodes_adding_singly(ChooseFun) -> R0 = riak_core_ring:fresh(Partitions, Node0), Rfinal = lists:foldl(fun(Node, Racc) -> Racc0 = riak_core_ring:add_member(Node0, Racc, Node), - %% TODO which is it? Claim or ChooseFun?? - %%claim(Racc0, {?MODULE, wants_claim_v2}, {?MODULE, ChooseFun}) - ?MODULE:ChooseFun(Racc0, Node, Params) + claim(Racc0, {?MODULE, wants_claim_v2}, {?MODULE, ChooseFun, Params}) end, R0, RestNodes), Preflists = riak_core_ring:all_preflists(Rfinal, Nval), ImperfectPLs = orddict:to_list( @@ -1586,7 +1708,7 @@ prop_wants() -> ?LET(X, choose(1,16), trunc(math:pow(2, X)))}, begin R0 = riak_core_ring:fresh(Q, tnode(1)), - {_, R2, Active} = + {_, R2, Active} = lists:foldl( fun(S, {I, R1, A1}) -> N = tnode(I), @@ -1598,12 +1720,12 @@ prop_wants() -> end end, {1, R0, []}, NodeStatus), Wants = wants(R2), - + %% Check any non-claiming nodes are set to 0 %% Check all nodes are present - {ActiveWants, InactiveWants} = + {ActiveWants, InactiveWants} = lists:partition(fun({N,_W}) -> lists:member(N, Active) end, Wants), - + ActiveSum = lists:sum([W || {_,W} <- ActiveWants]), InactiveSum = lists:sum([W || {_,W} <- InactiveWants]), ?WHENFAIL( @@ -1619,7 +1741,7 @@ prop_wants() -> {active, equals(Q, ActiveSum)}, {inactive, equals(0, InactiveSum)}])) end). - + %% Large positive integer between 1 and Max large_pos(Max) -> ?LET(X, largeint(), 1 + (abs(X) rem Max)). @@ -1635,24 +1757,24 @@ prop_take_idxs() -> S = length(ExchangesSeed), Dup = roundup(S / length(OwnersSeed)), Owners = lists:flatten( - lists:duplicate(Dup, - [tnode(abs(OwnerSeed) rem S) || + lists:duplicate(Dup, + [tnode(abs(OwnerSeed) rem S) || OwnerSeed <- OwnersSeed])), Q = length(Owners), TN = 1+abs(TNSeed), - - + + Ownership0 = orddict:from_list([{tnode(I), []} || I <- lists:seq(0, S -1)]), Ownership = lists:foldl(fun({I,O},A) -> orddict:append_list(O, [I], A) - end, + end, Ownership0, lists:zip(lists:seq(0, Q-1), Owners)), NIs = [{Node, undefined, Owned} || {Node, Owned} <- Ownership], %% Generate claimable indices CIdxs = ordsets:from_list([abs(Idx) rem Q || Idx <- CIdxsSeed]), - + %% io:format(user, "ExchangesSeed (~p): ~p\n", [length(ExchangesSeed), %% ExchangesSeed]), %% io:format(user, "NIs (~p): ~p\n", [length(NIs), NIs]), @@ -1662,12 +1784,12 @@ prop_take_idxs() -> abs(GiveSeed) rem (length(OIdxs) + 1), % maximum indices to give abs(TakeSeed) rem (Q+1), % maximum indices to take CIdxs} || % indices that can be claimed by node - {{Node, _Want, OIdxs}, {GiveSeed, TakeSeed}} <- + {{Node, _Want, OIdxs}, {GiveSeed, TakeSeed}} <- lists:zip(NIs, ExchangesSeed)], %% Fire the test NIs2 = take_idxs(Exchanges, NIs, Q, TN), - + %% Check All nodes are still in NIs %% Check that no node lost more than it wanted to give ?WHENFAIL( diff --git a/src/riak_core_ring.erl b/src/riak_core_ring.erl index b79ef5c0a..300f96fbe 100644 --- a/src/riak_core_ring.erl +++ b/src/riak_core_ring.erl @@ -83,6 +83,7 @@ members/2, has_location_changed/1, clear_location_changed/1, + force_location_changed/2, set_node_location/3, get_nodes_locations/1, set_claimant/2, @@ -332,12 +333,16 @@ members(?CHSTATE{members=Members}, Types) -> -spec has_location_changed(chstate()) -> boolean(). has_location_changed(State) -> - {ok, Value} = get_meta('$nodes_locations_changed', false, State), - Value. + {ok, Value} = get_meta('$nodes_locations_changed', false, State), + Value. -spec clear_location_changed(chstate()) -> chstate(). 
clear_location_changed(State) -> - update_meta('$nodes_locations_changed', false, State). + update_meta('$nodes_locations_changed', false, State). + +-spec force_location_changed(chstate(), boolean()) -> chstate(). +force_location_changed(State, Boolean) -> + update_meta('$nodes_locations_changed', Boolean, State). -spec set_node_location(node(), string(), chstate()) -> chstate(). set_node_location(Node, Location, State) -> diff --git a/src/riak_core_send_msg.erl b/src/riak_core_send_msg.erl index 15d018795..84576dad9 100644 --- a/src/riak_core_send_msg.erl +++ b/src/riak_core_send_msg.erl @@ -31,8 +31,7 @@ -ifdef(PULSE). -compile(export_all). -compile({parse_transform, pulse_instrument}). --compile({pulse_replace_module, [{gen_fsm_compat, pulse_gen_fsm}, - {gen_server, pulse_gen_server}]}). +-compile({pulse_replace_module, [{gen_server, pulse_gen_server}]}). -endif. -endif. diff --git a/test/riak_core_claim_eqc.erl b/test/riak_core_claim_eqc.erl new file mode 100644 index 000000000..389106dc1 --- /dev/null +++ b/test/riak_core_claim_eqc.erl @@ -0,0 +1,519 @@ +%%% @author Thomas Arts (thomas.arts@quviq.com) +%%% @doc QuickCheck model to replace riak_core_claim_statem +%%% by testing that part as well as testing location awareness +%%% (rack_awareness_test.erl) +%%% +%%% In reality each node has its own Ring structure. In this test we only build the ring structure +%%% for 1 claimant. +%%% +%%% We use the API as defined in riak_core_membership_claim. +%%% +%%% RUN WITH ./rebar3 as test eqc +%%% +%%% +%%% @end +%%% Created : 21 Feb 2023 by Thomas Arts + +-module(riak_core_claim_eqc). + +-include_lib("eqc/include/eqc.hrl"). +-include_lib("eqc/include/eqc_statem.hrl"). + +-compile([export_all, nowarn_export_all]). + + + +%% -- State ------------------------------------------------------------------ +-record(state, + { + ring_size, + placements = [] :: [{Name :: atom(), Location :: atom()}], %% AWS Partition placement groups + nodes = [] :: [Name :: atom()], %% all nodes that should be part of next plan + ring = undefined, + claimant = undefined :: atom(), + nval = 4, + committed_nodes = [], + staged_nodes = [] :: [Name :: atom()], %% nodes added/left before claim, + plan = [], %% staged nodes after claim + with_location = false + }). + +%% -- State and state functions ---------------------------------------------- +initial_state() -> + initial_state(3). + +initial_state(Nval) -> + #state{nval = Nval}. + +%% -- Generators ------------------------------------------------------------- + +%% -- Common pre-/post-conditions -------------------------------------------- +command_precondition_common(S, placements) -> + S#state.placements == []; +command_precondition_common(S, _Cmd) -> + S#state.placements /= []. + +%% -- Operations ------------------------------------------------------------- + +wrap_call(S, {call, Mod, Cmd, Args}) -> + try {ok, apply(Mod, Cmd, [S#state.ring | Args])} + catch + throw:{eqc, Reason, Trace} -> + {error, {'EXIT', Reason, Trace}, []}; + _:Reason:Trace -> {error, {'EXIT', Reason, Trace}, []} + end. + + +%% --- Operation: placements --- +%% Create nodes at specific locations +%% We assume AWS partition placement with P placements and N nodes per partition +%% Number of vnodes is determined later when ring_size is chosen to run on +%% this hardware +%% +%% we may want to add a location 'undefined' if we want to test that specific feature +placements_args(S) -> + [ locnodes(S#state.nval) ]. + +placements(_, _Primary) -> + ok. 
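%% Illustrative only (helper name is hypothetical, not part of the patch):
%% a generated Primary placement list pairs every node with a location.
%% Using the to_locnodes/1 generator helper defined later in this module,
%% a [2, 2, 1] configuration yields
%%   [{loc1,n1}, {loc1,n2}, {loc2,n3}, {loc2,n4}, {loc3,n5}].
example_placements() ->
    to_locnodes([2, 2, 1]).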
+
+placements_next(S, _, [Primary]) ->
+    S#state{placements = Primary,
+            nodes = []}.
+
+placements_features(S, [Primary], _Res) ->
+    [{with_location, S#state.with_location},
+     {nr_nodes, length(Primary)}].
+
+
+%% --- Operation: add_claimant ---
+%% Create the initial ring, owned by a single claimant node.
+add_claimant_pre(S) ->
+    S#state.claimant == undefined.
+
+add_claimant_args(S) ->
+    [hd(S#state.placements), S#state.with_location, ringsize()].
+
+add_claimant_pre(S, [LocNode, _, RingSize]) ->
+    LocNodes = S#state.placements,
+    length(LocNodes) =< RingSize andalso
+        lists:member(LocNode, LocNodes).
+
+add_claimant(_, {Loc, Node}, WithLocation, RingSize) ->
+    NewRing =
+        pp(riak_core_ring, fresh, [RingSize, Node]),
+    case WithLocation of
+        true ->
+            pp(riak_core_ring, set_node_location, [Node, Loc, NewRing]);
+        false ->
+            NewRing
+    end.
+
+add_claimant_next(S, Ring, [{_, Node}, _, RingSize]) ->
+    S#state{ring = Ring, nodes = [Node], claimant = Node, ring_size = RingSize}.
+
+add_claimant_features(_S, [_, _WithLocation, RingSize], _Res) ->
+    [{ring_size, RingSize}].
+
+
+
+%% --- Operation: add_node ---
+%% Make sure there is a non-started node to add.
+add_node_pre(S) ->
+    S#state.claimant /= undefined
+        andalso S#state.plan == []
+        andalso length(S#state.nodes) < length(S#state.placements).
+
+add_node_args(S) ->
+    ?LET(NewNode, elements([ {Loc, Node} || {Loc, Node} <- S#state.placements,
+                                            not lists:member(Node, S#state.nodes) ]),
+         [NewNode, S#state.with_location, S#state.claimant]).
+
+add_node_pre(S, [{Loc, Node}, _, Claimant]) ->
+    not lists:member(Node, S#state.nodes) andalso S#state.claimant == Claimant andalso
+        lists:member({Loc, Node}, S#state.placements).
+
+add_node(Ring, {Loc, Node}, WithLocation, Claimant) ->
+    NewRing =
+        pp(riak_core_ring, add_member, [Claimant, Ring, Node]),
+    case WithLocation of
+        true ->
+            pp(riak_core_ring, set_node_location, [Node, Loc, NewRing]);
+        false ->
+            NewRing
+    end.
+
+add_node_next(S=#state{nodes=Nodes}, Ring, [{_, Node}, _, _]) ->
+    S#state{ring = Ring, nodes = [Node | Nodes],
+            staged_nodes = stage(S#state.staged_nodes, Node, add)}.
+
+add_node_post(_S, [{_Loc, Node}, _, _Claimant], NextRing) ->
+    lists:member(Node, riak_core_ring:members(NextRing, [joining])).
+
+
+
+%% --- Operation: claim ---
+
+%% @doc claim_pre/1 - Precondition for generation
+claim_pre(S) ->
+    S#state.ring /= undefined andalso length(S#state.nodes) >= S#state.nval
+        andalso necessary_conditions(S)
+        andalso S#state.plan == [] andalso S#state.staged_nodes /= [].  %% make sure there is something sensible to do
+
+claim_args(S) ->
+    [elements([v4]), S#state.nval].
+
+claim(Ring, default, Nval) ->
+    pp(riak_core_membership_claim, claim, [Ring,
+                                           {riak_core_membership_claim, default_wants_claim},
+                                           {riak_core_membership_claim, sequential_claim, Nval}]);
+claim(Ring, v2, Nval) ->
+    pp(riak_core_membership_claim, claim, [Ring,
+                                           {riak_core_membership_claim, wants_claim_v2},
+                                           {riak_core_membership_claim, choose_claim_v2, [{target_n_val, Nval}]}]);
+claim(Ring, v4, Nval) ->
+    pp(riak_core_membership_claim, claim, [Ring,
+                                           {riak_core_membership_claim, wants_claim_v2},
+                                           {riak_core_claim_swapping, choose_claim_v4, [{target_n_val, Nval}]}]).
+
+claim_pre(S, [v4, _Nval]) ->
+    not known_hard(S) andalso (length(S#state.nodes) < S#state.ring_size div 2);
+claim_pre(_, [_, _]) ->
+    true.
+
+
+
+claim_next(S, NewRing, [_, _]) ->
+    S#state{ring = NewRing, plan = S#state.staged_nodes, staged_nodes = []}.
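%% Illustrative sketch, not part of the model: the v4 clause above
%% reduces to the direct calls below; the ring size 64 and node names
%% n1..n4 are arbitrary example values.
claim_v4_sketch() ->
    Ring0 = riak_core_ring:fresh(64, n1),
    %% stage three more nodes to join, as the add_node operation does
    Ring1 =
        lists:foldl(
          fun(N, R) -> riak_core_ring:add_member(n1, R, N) end,
          Ring0, [n2, n3, n4]),
    riak_core_membership_claim:claim(
      Ring1,
      {riak_core_membership_claim, wants_claim_v2},
      {riak_core_claim_swapping, choose_claim_v4, [{target_n_val, 4}]}).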
+ +claim_post(#state{nval = Nval, ring_size = RingSize, nodes = Nodes} = S, [_, _], NewRing) -> + Preflists = riak_core_ring:all_preflists(NewRing, Nval), + LocNval = if Nval > 3 -> Nval - 1; + true -> Nval end, + ImperfectPLs = + lists:foldl(fun(PL,Acc) -> + PLNodes = lists:usort([N || {_,N} <- PL]), + case length(PLNodes) of + Nval -> + Acc; + _ -> + [PL | Acc] + end + end, [], Preflists), + ImperfectLocations = + lists:foldl(fun(PL,Acc) -> + PLLocs = lists:usort([location(S, N) || {_, N} <- PL]) -- [undefined], + case length(PLLocs) of + LocNval -> + Acc; + Nval -> + Acc; + _ when S#state.with_location -> + [{PLLocs, PL} | Acc]; + _ -> + Acc + end + end, [], Preflists), + RiakRingSize = riak_core_ring:num_partitions(NewRing), + RiakNodeCount = length(riak_core_ring:members(NewRing, [joining, valid])), + + BalancedRing = + riak_core_membership_claim:balanced_ring(RiakRingSize, + RiakNodeCount, NewRing), + %% S#state.committed_nodes == [] orelse + eqc_statem:conj( + [eqc_statem:tag(ring_size, eq(RiakRingSize, RingSize)), + eqc_statem:tag(node_count, eq(RiakNodeCount, length(Nodes))), + eqc_statem:tag(meets_target_n, eq(riak_core_membership_claim:meets_target_n(NewRing, Nval), {true, []})), + eqc_statem:tag(correct_nodes, eq(chash:members(riak_core_ring:chash(NewRing)), lists:sort(Nodes))), + eqc_statem:tag(perfect_pls, eq(ImperfectPLs, [])), + eqc_statem:tag(perfect_locations, eq(ImperfectLocations, [])), + %% eqc_statem:tag(few_moves, length(S#state.committed_nodes) =< 1 orelse length(diff_nodes(S#state.ring, NewRing)) < S#state.ring_size div 2), + eqc_statem:tag(balanced_ring, BalancedRing)]). + +claim_features(#state{nodes = Nodes} = S, [Alg, _], Res) -> + [{claimed_nodes, length(Nodes)}, + {algorithm, Alg}] ++ + %% and if we add to an already claimed ring + [{moving, {S#state.ring_size, S#state.nval, + {joining, length(S#state.nodes -- S#state.committed_nodes)}, + {leaving, length(S#state.committed_nodes -- S#state.nodes)}, + {moves, length(diff_nodes(S#state.ring, Res))}}} + || length(S#state.committed_nodes) > 1]. + + +diff_nodes(Ring1, Ring2) -> + %% get the owners per vnode hash + {Rest, Differences} = + lists:foldl(fun({Hash2, Node2}, {[{Hash1, Node1} | HNs], Diffs}) when Hash1 == Hash2 -> + case Node1 == Node2 of + true -> {HNs, Diffs}; + false -> {HNs, [{vnode_moved, Hash1, Node1, Node2}|Diffs]} + end; + ({Hash2, _}, {[{Hash1, Node1} | HNs], Diffs}) when Hash1 < Hash2 -> + {HNs, [{vnode_split, Hash1, Node1}|Diffs]}; + ({Hash2, Node2}, {[{Hash1, Node1} | HNs], Diffs}) when Hash1 > Hash2 -> + {[{Hash1, Node1} | HNs], [{vnode_diff, Hash2, Node2}|Diffs]} + end, {riak_core_ring:all_owners(Ring1), []}, riak_core_ring:all_owners(Ring2)), + [{only_left, E} || E <- Rest] ++ Differences. 
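%% Illustrative only: diff_nodes/2 walks the sorted {Hash, Owner} lists
%% of two rings in lock step. For owner lists
%%   Ring1: [{0,n1}, {1,n2}]    Ring2: [{0,n1}, {1,n3}]
%% it returns [{vnode_moved, 1, n2, n3}]; a hash present only in the
%% first ring is tagged vnode_split, one present only in the second is
%% tagged vnode_diff, and owners left unmatched in the first ring are
%% reported as only_left.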
+
+
+%% necessary for a solution to exist:
+%% without locations, the ring size must divide evenly over the nodes, or
+%% the remainder must be at least nval (e.g. 16 rem 5 = 1 < 4 cannot be
+%% claimed with nval 4, while 16 rem 6 = 4 can).
+necessary_conditions(S) when not S#state.with_location ->
+    Remainder = S#state.ring_size rem length(S#state.nodes),
+    Remainder == 0 orelse Remainder >= S#state.nval;
+necessary_conditions(S) ->
+    Locations = to_config(S),
+    Rounds = S#state.ring_size div S#state.nval,
+    NumNodes = length(S#state.nodes),
+    MinOccs = S#state.ring_size div NumNodes,
+    MaxOccs =
+        if S#state.ring_size rem NumNodes == 0 -> MinOccs;
+           true -> 1 + MinOccs
+        end,
+
+    MinOccForLocWith =
+        fun(N) -> max(N * MinOccs, S#state.ring_size - MaxOccs * lists:sum(Locations -- [N])) end,
+    MaxOccForLocWith =
+        fun(N) -> min(N * MaxOccs, S#state.ring_size - MinOccs * lists:sum(Locations -- [N])) end,
+
+    lists:all(fun(B) -> B end,
+              [length(Locations) >= S#state.ring_size || Rounds < 2] ++
+                  [length(Locations) >= S#state.nval] ++
+                  [ MinOccForLocWith(Loc) =< Rounds || Loc <- Locations ] ++
+                  [ Rounds =< MaxOccForLocWith(LSize)
+                    || S#state.nval == length(Locations), LSize <- Locations] ++
+                  [ S#state.ring_size rem S#state.nval == 0
+                    || S#state.nval == length(Locations) ]
+             ).
+
+to_config(S) ->
+    LocNodes =
+        lists:foldl(fun(N, Acc) ->
+                            Loc = location(S, N),
+                            Acc#{Loc => maps:get(Loc, Acc, 0) + 1}
+                    end, #{}, S#state.nodes),
+    maps:values(LocNodes).
+
+
+
+
+
+%% --- Operation: leave_node ---
+leave_node_pre(S) ->
+    %% keep more than one node, so the initial node is never removed
+    length(S#state.nodes) > 1 andalso S#state.committed_nodes /= [].
+
+leave_node_args(S) ->
+    %% TODO consider re-leaving nodes that have already left
+    [elements(S#state.nodes),
+     S#state.with_location,
+     S#state.claimant].
+
+leave_node_pre(#state{nodes=_Nodes}, [Node, _, Claimant]) ->
+    Claimant /= Node. %% andalso lists:member(Node, Nodes).
+
+leave_node(Ring, NodeName, _WithLocation, Claimant) ->
+    pp(riak_core_ring, leave_member, [Claimant, Ring, NodeName]).
+
+leave_node_next(S=#state{nodes = Nodes}, NewRing, [Node, _, _]) ->
+    S#state{ring = NewRing, nodes = lists:delete(Node, Nodes),
+            staged_nodes = stage(S#state.staged_nodes, Node, leave)}.
+
+leave_node_post(_S, [NodeName, _, _], NextRing) ->
+    lists:member(NodeName, riak_core_ring:members(NextRing, [leaving])).
+
+%% --- Operation: commit ---
+
+%% In the code this is an involved process with gossiping and transferring data.
+%% Here we just assume that all works out fine and make joining nodes valid nodes
+%% in the result of the planned new ring.
+%% In other words, we assume that the plan is established and only update
+%% the joining nodes to valid.
+
+commit_pre(S) ->
+    S#state.plan /= [].
+
+commit_args(S) ->
+    [S#state.claimant].
+
+commit(Ring, Claimant) ->
+    JoiningNodes = riak_core_ring:members(Ring, [joining]),  %% [ Node || {Node, joining} <- riak_core_ring:all_member_status(Ring) ],
+    lists:foldl(fun(Node, R) ->
+                        riak_core_ring:set_member(Claimant, R, Node, valid, same_vclock)
+                end, Ring, JoiningNodes).
+
+commit_next(S, NewRing, [_]) ->
+    S#state{ring = NewRing, staged_nodes = [], plan = [], committed_nodes = S#state.nodes}.
+
+commit_post(#state{nodes = Nodes}, [_], Ring) ->
+    eq(Nodes -- riak_core_ring:members(Ring, [valid]), []).
+
+
+
+
+
+weight(S, add_node) when not S#state.with_location ->
+    1 + 4*(length(S#state.placements) - length(S#state.nodes));
+weight(S, add_located_nodes) when S#state.with_location ->
+    0;
+weight(S, leave_node) ->
+    1 + (length(S#state.committed_nodes) div 4);
+weight(_S, _Cmd) -> 1.
+
+
+
+%% --- ...
more operations + +%% -- Property --------------------------------------------------------------- + +prop_claim() -> + case ets:whereis(timing) of + undefined -> ets:new(timing, [public, named_table, bag]); + _ -> ok + end, + ?FORALL({Nval, WithLocation}, {choose(2, 5), bool()}, + ?FORALL(Cmds, commands(?MODULE, with_location(initial_state(Nval), WithLocation)), + begin + put(ring_nr, 0), + {H, S, Res} = run_commands(Cmds), + measure(length, commands_length(Cmds), + aggregate_feats([claimed_nodes, ring_size, with_location, nr_nodes, moving, algorithm], + call_features(H), + check_command_names(Cmds, + pretty_commands(?MODULE, Cmds, {H, S, Res}, + Res == ok)))) + end)). + +aggregate_feats([], _, Prop) -> Prop; +aggregate_feats([Op | Ops], Features, Prop) -> + aggregate(with_title(Op), + [F || {Id, F} <- Features, Id == Op], + aggregate_feats(Ops, Features, Prop)). + + +with_location(S, Bool) -> + S#state{with_location = Bool}. + +location(S, N) when is_record(S, state) -> + location(S#state.placements, N); +location(LocNodes, N) -> + case lists:keyfind(N, 2, LocNodes) of + {Loc, _} -> Loc; + _ -> exit({not_found, N, LocNodes}) + end. + +bugs() -> bugs(10). + +bugs(N) -> bugs(N, []). + +bugs(Time, Bugs) -> + more_bugs(eqc:testing_time(Time, prop_claim()), 20, Bugs). + +locnodes(Nval) -> + ?LET(MaxLoc, choose(Nval, Nval * 2), configs(MaxLoc, Nval)). + +ringsize() -> + ?LET(Exp, choose(5, 8), + power2(Exp)). + +configs(MaxLocations, Nval) when MaxLocations < Nval -> + {error, too_few_locations}; +configs(MaxLocations, Nval) -> + ?LET(Max, choose(1, 8), + ?LET(Less, vector(MaxLocations - Nval, choose(1, Max)), + to_locnodes([Max || _ <- lists:seq(1, Nval)] ++ Less))). + +to_locnodes(NodeList) -> + NodesSet = [ list_to_atom(lists:concat([n, Nr])) || Nr <- lists:seq(1, lists:sum(NodeList))], + LocationsSet = [ list_to_atom(lists:concat([loc, Nr])) || Nr <- lists:seq(1, length(NodeList))], + to_locnodes(lists:zip(LocationsSet, NodeList), NodesSet). + +to_locnodes([], []) -> + []; +to_locnodes([{Loc, Nr}| LocNrs], NodesSet) -> + Nodes = lists:sublist(NodesSet, Nr), + [ {Loc, Node} || Node <- Nodes ] ++ + to_locnodes(LocNrs, NodesSet -- Nodes). + +stage(Staged, Node, Kind) -> + lists:keydelete(Node, 1, Staged) ++ [{Node, Kind}]. + + +power2(0) -> + 1; +power2(1) -> + 2; +power2(N) when N rem 2 == 0 -> + X = power2(N div 2), + X*X; +power2(N) -> + 2*power2(N-1). + + +pp(M, F, As) -> + _Call = lists:flatten([io_lib:format("~p:~p(", [M, F])] ++ + string:join([as_ring(arg, A) || A <-As], ",") ++ + [")."]), + try {Time, R} = timer:tc(M, F, As), + %% eqc:format("~s = ~s\n", [as_ring(res, R), _Call ]), + if Time > 150000 -> ets:insert(timing, [{F, As, Time}]); + true -> ok + end, + R + catch _:Reason:ST -> + eqc:format("~s \n", [ _Call ]), + throw({eqc, Reason, ST}) + end. + +as_ring(Kind, Term) when is_tuple(Term) -> + case element(1, Term) of + chstate_v2 -> + case Kind of + arg -> + OldRing = get(ring_nr), + lists:concat(["Ring_",OldRing]); + res -> + OldRing = put(ring_nr, get(ring_nr) + 1), + lists:concat(["Ring_",OldRing + 1]) + end; + _ -> lists:flatten(io_lib:format("~p", [Term])) + end; +as_ring(_, Term) -> + lists:flatten(io_lib:format("~p", [Term])). + + +known_hard(S) -> + lists:member({S#state.ring_size, lists:sort(to_config(S)), S#state.nval}, + [{16, [1,1,1,2,2,2], 5}, + {16, [1,1,1,1,3,3], 5}, + {16, [1,1,1,3,3], 4}, + {16, [1,1,1,1,2,3], 4}, + {16, [1,1,4,4], 3}, + {16, [1,2,2,2,3], 4}, + {128, [1,1,2,2,2,2], 5} + ]). 
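%% Illustrative only: pp/3 above records every wrapped call that takes
%% longer than 150 ms as a {Function, Args, Microseconds} entry in the
%% named ets table that prop_claim/0 creates, so the slow claim calls
%% can be listed after a run with
%%   ets:tab2list(timing).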
+ + + +equal({X, Ls1}, {X, Ls2}) -> + equal(Ls1, Ls2); +equal([X|Ls1], [X|Ls2]) -> + equal(Ls1, Ls2); +equal(X, Y) -> + equals(X, Y). + +%% create a config and add or leave nodes for next config +prop_translate() -> + ?FORALL(LocNodes1, ?LET(X, configs(4, 2), shuffle(X)), + ?FORALL([LocNodesA, LocNodesL], vector(2, ?LET(C, configs(4,2), sublist(C))), + ?FORALL(LocNodes2, shuffle((LocNodes1 -- LocNodesL) ++ LocNodesA), + begin + Leaving = [ N || {L, N} <- LocNodes1, not lists:member({L, N}, LocNodes2)], + {_R, OldLocRel} = riak_core_claim_swapping:to_binring(LocNodes1, Leaving), + StayTheSame = [ {Idx, {L, N}} || {Idx, {L, N}} <- OldLocRel, not lists:member(N, Leaving) ], + {_Config, NewLocRel} = riak_core_claim_swapping:to_config2(LocNodes2, OldLocRel), + equals([ Same + || {Idx1, {L,N}} = Same <- StayTheSame, + {Idx2, _} <- [lists:keyfind({L,N}, 2, NewLocRel)], + Idx1 == Idx2], StayTheSame) + end))). From 278ffa405a7565ab68210ca218f2734d6d5d078f Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 3 May 2023 12:57:57 +0100 Subject: [PATCH 11/30] Claim API requires export of 2-arity choose function as well as 3-arity --- src/riak_core_claim_swapping.erl | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/riak_core_claim_swapping.erl b/src/riak_core_claim_swapping.erl index 057778af5..6f9e09450 100644 --- a/src/riak_core_claim_swapping.erl +++ b/src/riak_core_claim_swapping.erl @@ -47,7 +47,7 @@ -module(riak_core_claim_swapping). -export([claim/1, claim/2, - choose_claim_v4/3]). + choose_claim_v4/3, choose_claim_v4/2]). -ifdef(TEST). -export([to_binring/2, to_config2/2]). @@ -62,6 +62,11 @@ choose_claim_v4(Ring, _Node, Params) -> claim(Ring, Params). +-spec choose_claim_v4(riak_core_ring:riak_core_ring(), node()) -> + riak_core_ring:riak_core_ring(). +choose_claim_v4(Ring, _Node) -> + claim(Ring). + -spec claim(riak_core_ring:riak_core_ring()) -> riak_core_ring:riak_core_ring(). claim(Ring) -> Params = riak_core_membership_claim:default_choose_params(), From e13f4e0897aea9b9a5cd6fa68eafd900cb2c189d Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Tue, 9 May 2023 09:55:11 +0100 Subject: [PATCH 12/30] Always return indices Otherwise, if all vnodes have become excluded there is no escape from this condition (unless other traffic can trigger the creation of vnodes). This is helpful in situations where transfers are performed on standby clusters with no other traffic. This commit also logs a timing of the claim each time it is called. --- src/riak_core_claimant.erl | 5 +++++ src/riak_core_ring_handler.erl | 7 +------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/riak_core_claimant.erl b/src/riak_core_claimant.erl index f82cc324e..38aa60468 100644 --- a/src/riak_core_claimant.erl +++ b/src/riak_core_claimant.erl @@ -1459,7 +1459,12 @@ rebalance_ring(CState) -> rebalance_ring(Next, CState). 
rebalance_ring([], CState) -> + SW = os:timestamp(), CState2 = riak_core_membership_claim:claim(CState), + lager:info( + "Claim algorithm request completed in claim_time=~w ms", + [timer:now_diff(os:timestamp(), SW) div 1000] + ), Owners1 = riak_core_ring:all_owners(CState), Owners2 = riak_core_ring:all_owners(CState2), Owners3 = lists:zip(Owners1, Owners2), diff --git a/src/riak_core_ring_handler.erl b/src/riak_core_ring_handler.erl index a13a4af61..a752f7d05 100644 --- a/src/riak_core_ring_handler.erl +++ b/src/riak_core_ring_handler.erl @@ -178,12 +178,7 @@ startable_vnodes(Mod, Ring) -> Excl = ModExcl -- riak_core_ring:disowning_indices(Ring, node()), case riak_core_ring:random_other_index(Ring, Excl) of no_indices -> - case length(Excl) =:= riak_core_ring:num_partitions(Ring) of - true -> - []; - false -> - riak_core_ring:my_indices(Ring) - end; + riak_core_ring:my_indices(Ring); RO -> [RO | riak_core_ring:my_indices(Ring)] end From e5083f122169b7904c970e1289604330d9e25584 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Tue, 9 May 2023 12:35:04 +0100 Subject: [PATCH 13/30] Calculate swaps only once --- src/riak_core_claim_binring_alg.erl | 56 ++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/src/riak_core_claim_binring_alg.erl b/src/riak_core_claim_binring_alg.erl index 640b02780..d73515cb1 100644 --- a/src/riak_core_claim_binring_alg.erl +++ b/src/riak_core_claim_binring_alg.erl @@ -311,23 +311,34 @@ brute_force(Ring, NVals, Options) -> brute_force(Ring, NVals, Options, violations(Ring, NVals))). brute_force(Ring, NVals, Options, V) -> + brute_force(Ring, NVals, Options, V, false). + +brute_force(Ring, NVals, Options, V, OrigSwaps) -> TryHard = proplists:get_bool(try_hard, Options), case V of _ when not TryHard, ?is_zero_v(V) -> Ring; ?zero_v -> Ring; _ -> N = ring_size(Ring), - %% TODO: keep swaps so we don't start over every time (earlier swaps are less likely to work) - Swaps = [ {swap, I, J} || I <- lists:seq(0, N - 2), J <- lists:seq(I, N - 1) ] ++ - lists:sort(fun({move, I1, J1}, {move, I2, J2}) -> abs(I1 - J1) =< abs(I2 - J2) end, - [ {move, I, J} || not proplists:get_bool(only_swap, Options) - , I <- lists:seq(0, N - 1), J <- lists:seq(0, N - 1) - , D <- [mod_dist(J, I, N)] - , D > 2 orelse D < -1 %% Moving just one step is a swap - ]), - brute_force(Ring, NVals, V, Options, Ring, ?zero_v, Swaps) + Swaps = + case OrigSwaps of + false -> + generate_swaps(N, Options); + OrigSwaps when is_list(OrigSwaps) -> + OrigSwaps + end, + brute_force(Ring, NVals, V, Options, Ring, ?zero_v, Swaps, Swaps) end. +generate_swaps(N, Options) -> + [ {swap, I, J} || I <- lists:seq(0, N - 2), J <- lists:seq(I, N - 1) ] ++ + lists:sort(fun({move, I1, J1}, {move, I2, J2}) -> abs(I1 - J1) =< abs(I2 - J2) end, + [ {move, I, J} || not proplists:get_bool(only_swap, Options) + , I <- lists:seq(0, N - 1), J <- lists:seq(0, N - 1) + , D <- [mod_dist(J, I, N)] + , D > 2 orelse D < -1 %% Moving just one step is a swap + ]). + mod_dist(I, J, N) -> D = (J - I + N) rem N, if D * 2 > N -> D - N; @@ -335,20 +346,20 @@ mod_dist(I, J, N) -> end. 
%% TODO: Don't use DeltaV for BestV (total violations instead) -brute_force(_Ring, NVals, V, Options, Best, BestV, []) when BestV < ?zero_v -> +brute_force(_Ring, NVals, V, Options, Best, BestV, [], OrigSwaps) when BestV < ?zero_v -> ?debug("~s\n", [show(Best, NVals)]), - brute_force(Best, NVals, Options, add_v(V, BestV)); -brute_force(_Ring, _NVals, _V, _Options, Best, _BestV, []) -> Best; -brute_force(Ring, NVals, V, Options, Best, BestV, [Op | Swaps]) -> + brute_force(Best, NVals, Options, add_v(V, BestV), OrigSwaps); +brute_force(_Ring, _NVals, _V, _Options, Best, _BestV, [], _OrigSwaps) -> Best; +brute_force(Ring, NVals, V, Options, Best, BestV, [Op | Swaps], OrigSwaps) -> {Ring1, DV} = op(Ring, NVals, Op), TryHard = proplists:get_bool(try_hard, Options), if DV < ?zero_v, not TryHard -> ?debug("~s\n", [show(Ring1, NVals)]), - brute_force(Ring1, NVals, Options, add_v(V, DV)); + brute_force(Ring1, NVals, Options, add_v(V, DV), OrigSwaps); DV < BestV -> - brute_force(Ring, NVals, V, Options, Ring1, DV, Swaps); + brute_force(Ring, NVals, V, Options, Ring1, DV, Swaps, OrigSwaps); true -> - brute_force(Ring, NVals, V, Options, Best, BestV, Swaps) + brute_force(Ring, NVals, V, Options, Best, BestV, Swaps, OrigSwaps) end. op(Ring, NVals, {swap, I, J}) -> @@ -561,6 +572,19 @@ show_update(RingSize, OldConfig, NewConfig, NVals) -> -ifdef(TEST). +generate_swaps_test() -> + time_generating_swaps(32), + time_generating_swaps(128), + time_generating_swaps(1024). + +time_generating_swaps(N) -> + SW = os:timestamp(), + Swaps = generate_swaps(N, []), + io:format( + user, + "Generate swaps for RS ~w in ~w ms length ~w~n", + [N, timer:now_diff(os:timestamp(), SW) div 1000, length(Swaps)]). + %% -- Unit tests for experimentation --------------------------------------- %% These tests take a bit of time when running. %% Not intended to be included in automatic testing. From 6820c7b3c3f7ddf352810dd7373c7bf4b8be33dc Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Thu, 11 May 2023 10:11:21 +0100 Subject: [PATCH 14/30] Remember v4 solutions via claimant To allow for the riak_core_ring_manager and riak_core_claimant to remember v4 solutions, they are shared via the state of the claimant --- src/riak_core_claim_swapping.erl | 54 ++++++++++++++++++++++++++++--- src/riak_core_claimant.erl | 55 +++++++++++++++++++++++--------- 2 files changed, 90 insertions(+), 19 deletions(-) diff --git a/src/riak_core_claim_swapping.erl b/src/riak_core_claim_swapping.erl index 6f9e09450..6856ad64d 100644 --- a/src/riak_core_claim_swapping.erl +++ b/src/riak_core_claim_swapping.erl @@ -56,6 +56,28 @@ %% The algorithm does not use any wants claim logic. %% For backward compatibility one can combine wants_claim_v2 with the choose here +-spec memoize( + binring_solve|binring_update, + {binary()|pos_integer(), + list(pos_integer()), + {pos_integer(), pos_integer()}}, + fun(() -> binary())) -> binary(). +memoize(Registry, Key, Fun) -> + V4Solutions = + case get(v4_solutions) of + undefined -> []; + V4SL -> V4SL + end, + case lists:keyfind({Registry, Key}, 1, V4Solutions) of + {{Registry, Key}, Solution} -> + lager:info("Retrieved solve from memory for ~w", [Registry]), + Solution; + _ -> + Value = Fun(), + riak_core_claimant:update_v4_solutions({{Registry, Key}, Value}), + Value + end. + %% Backward compatible interface -spec choose_claim_v4(riak_core_ring:riak_core_ring(), node(), [{atom(), term()}]) -> riak_core_ring:riak_core_ring(). 
@@ -97,16 +119,40 @@ claim(Ring, Params0) -> {BinRing0, OldLocRel} = to_binring(Ring), {Config, LocRel} = to_config(Ring, OldLocRel), - %% io:format("Config = ~p RingSize ~p nval ~p\n", [Config, RingSize, NVals]), - BinRing1 = riak_core_claim_binring_alg:update(BinRing0, Config, NVals), + SWup = os:timestamp(), + BinRing1 = + memoize( + binring_update, + {BinRing0, Config, NVals}, + fun() -> + riak_core_claim_binring_alg:update(BinRing0, Config, NVals) + end), + lager:info( + "~w Swapping algorithm update in ~w ms Config ~w NVals ~w h(BinRing) ~w", + [self(), timer:now_diff(os:timestamp(), SWup) div 1000, + Config, NVals, erlang:phash2(BinRing1)] + ), + SWsv = os:timestamp(), + UpdateSolved = + riak_core_claim_binring_alg:zero_violations(BinRing1, NVals), BinRing = - case riak_core_claim_binring_alg:zero_violations(BinRing1, NVals) of + case UpdateSolved of false -> - riak_core_claim_binring_alg:solve(RingSize, Config, NVals); + memoize( + binring_solve, + {RingSize, Config, NVals}, + fun() -> + riak_core_claim_binring_alg:solve(RingSize, Config, NVals) + end); true -> BinRing1 end, + lager:info( + "~w Swapping algorithm solve in ~w ms as solve_required=~w", + [self(), timer:now_diff(os:timestamp(), SWsv) div 1000, + not UpdateSolved] + ), Inc = chash:ring_increment(RingSize), SolvedNodes = diff --git a/src/riak_core_claimant.erl b/src/riak_core_claimant.erl index 38aa60468..a9debc7fe 100644 --- a/src/riak_core_claimant.erl +++ b/src/riak_core_claimant.erl @@ -43,7 +43,8 @@ get_bucket_type/2, get_bucket_type/3, bucket_type_iterator/0, - set_node_location/2]). + set_node_location/2, + update_v4_solutions/1]). -export([reassign_indices/1]). % helpers for claim sim %% gen_server callbacks @@ -62,21 +63,32 @@ %% {Ring, NewRing} where NewRing = f(Ring) -type ring_transition() :: {riak_core_ring(), riak_core_ring()}. --record(state, { - last_ring_id, - %% The set of staged cluster changes - changes :: [{node(), action()}], - - %% Ring computed during the last planning stage based on - %% applying a set of staged cluster changes. When commiting - %% changes, the computed ring must match the previous planned - %% ring to be allowed. - next_ring :: riak_core_ring()|undefined, +-type v4_solution() :: + {{binring_solve|binring_update, + {binary()|pos_integer(), + list(pos_integer()), + {pos_integer(), pos_integer()}}}, + binary()}. - %% Random number seed passed to remove_node to ensure the - %% current randomized remove algorithm is deterministic - %% between plan and commit phases - seed}). +-record(state, { + last_ring_id, + %% The set of staged cluster changes + changes :: [{node(), action()}], + + %% Ring computed during the last planning stage based on + %% applying a set of staged cluster changes. When commiting + %% changes, the computed ring must match the previous planned + %% ring to be allowed. + next_ring :: riak_core_ring()|undefined, + + %% Random number seed passed to remove_node to ensure the + %% current randomized remove algorithm is deterministic + %% between plan and commit phases + seed, + + %% List of v4 solutions - to be copied to the process memory of + %% the riak_core_ring_manager when + v4_solutions = [] :: list(v4_solution())}). -define(ROUT(S,A),ok). %%-define(ROUT(S,A),?debugFmt(S,A)). @@ -117,6 +129,10 @@ plan() -> commit() -> gen_server:call(claimant(), commit, infinity). +-spec update_v4_solutions(v4_solution()) -> ok. +update_v4_solutions(V4Solution) -> + gen_server:cast(?MODULE, {update_v4_solutions, V4Solution}). 
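%% Illustrative summary, derived from the code above and below: the cache
%% round trip is (1) the claimant copies its state's v4_solutions into
%% its process dictionary before planning (see generate_plan/3), (2) on a
%% miss riak_core_claim_swapping:memoize/3 computes the solution and
%% casts it back via update_v4_solutions/1, and (3) handle_cast folds the
%% entry into the state with lists:ukeysort/2, keeping at most one
%% solution per {Registry, Key}.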
+ %% @doc Stage a request for `Node' to leave the cluster. If committed, `Node' %% will handoff all of its data to other nodes in the cluster and then %% shutdown. @@ -345,6 +361,12 @@ handle_call(_Request, _From, State) -> Reply = ok, {reply, Reply, State}. +handle_cast({update_v4_solutions, V4Solution}, State) -> + {noreply, + State#state{ + v4_solutions = + lists:ukeysort(1, [V4Solution|State#state.v4_solutions]) + }}; handle_cast(_Msg, State) -> {noreply, State}. @@ -397,6 +419,7 @@ generate_plan([], _, State) -> %% There are no changes to apply {{ok, [], []}, State}; generate_plan(Changes, Ring, State=#state{seed=Seed}) -> + put(v4_solutions, State#state.v4_solutions), case compute_all_next_rings(Changes, Seed, Ring) of {error, invalid_resize_claim} -> {{error, invalid_resize_claim}, State}; @@ -432,10 +455,12 @@ maybe_commit_staged(State) -> %% @private maybe_commit_staged(Ring, State=#state{changes=Changes, seed=Seed}) -> Changes2 = filter_changes(Changes, Ring), + put(v4_solutions, State#state.v4_solutions), case compute_next_ring(Changes2, Seed, Ring) of {error, invalid_resize_claim} -> {ignore, invalid_resize_claim}; {ok, NextRing} -> + erase(v4_solutions), maybe_commit_staged(Ring, NextRing, State) end. From 6527033e246367cb7d97a3059ccbf9caff4a107c Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Mon, 15 May 2023 13:53:54 +0100 Subject: [PATCH 15/30] Long-running tests --- src/riak_core_claim_swapping.erl | 36 ++++++++++++++------------------ 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/src/riak_core_claim_swapping.erl b/src/riak_core_claim_swapping.erl index 6856ad64d..f43f5eb35 100644 --- a/src/riak_core_claim_swapping.erl +++ b/src/riak_core_claim_swapping.erl @@ -287,12 +287,11 @@ location_t1_test_() -> {"[2, 2, 2, 2, 2] nval 4", {inparallel, [location_claim_tester(n1, loc1, JoiningNodes, 64, 4), - location_claim_tester(n1, loc1, JoiningNodes, 128, 4), - location_claim_tester(n1, loc1, JoiningNodes, 256, 4) + location_claim_tester(n1, loc1, JoiningNodes, 128, 4), + location_claim_tester(n1, loc1, JoiningNodes, 256, 4), %% Don't test large rings in automated testing - %% location_claim_tester(n1, loc1, JoiningNodes, 512, 4), - %% location_claim_tester(n1, loc1, JoiningNodes, 1024, 4) - %% location_claim_tester(n1, loc1, JoiningNodes, 2048, 4) + location_claim_tester(n1, loc1, JoiningNodes, 512, 4), + location_claim_tester(n1, loc1, JoiningNodes, 1024, 4) ]}}. location_t2_test_() -> @@ -308,8 +307,7 @@ location_t2_test_() -> location_claim_tester(n1, loc1, JoiningNodes, 128, 4), location_claim_tester(n1, loc1, JoiningNodes, 256, 4), location_claim_tester(n1, loc1, JoiningNodes, 512, 4), - location_claim_tester(n1, loc1, JoiningNodes, 1024, 4), - location_claim_tester(n1, loc1, JoiningNodes, 2048, 4) + location_claim_tester(n1, loc1, JoiningNodes, 1024, 4) ]}}. location_t8_test_() -> @@ -321,16 +319,15 @@ location_t8_test_() -> {"[4, 3, 3, 3] nval 4", {inparallel, [location_claim_tester(l1n1, loc1, JoiningNodes, 64, 3), - location_claim_tester(l1n1, loc1, JoiningNodes, 256, 3) - %% Don't test large rings in automated testing - %% location_claim_tester(n1, loc1, JoiningNodes, 512, 4), - %% location_claim_tester(n1, loc1, JoiningNodes, 1024, 4), - %% location_claim_tester(n1, loc1, JoiningNodes, 2048, 4) + location_claim_tester(l1n1, loc1, JoiningNodes, 256, 3) + %% Don't test large rings in automated testing + %% location_claim_tester(n1, loc1, JoiningNodes, 512, 4), + %% location_claim_tester(n1, loc1, JoiningNodes, 1024, 4) ]}}. 
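%% Illustrative only (standard EUnit forms): each generator above returns
%% {Title, {inparallel, Tests}}, and every case below is wrapped as
%% {timeout, Seconds, Test} because EUnit's default limit of five seconds
%% per test is far too tight for the 512 and 1024 partition rings now
%% exercised. A minimal shape:
%%   slow_test_() ->
%%       {"slow case", {timeout, 600, fun() -> ok end}}.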
location_claim_tester(N1, N1Loc, NodeLocList, RingSize, TargetN) -> {"Ringsize "++integer_to_list(RingSize), - {timeout, 120, + {timeout, 600, fun() -> io:format( "Testing NodeList ~w with RingSize ~w~n", @@ -391,7 +388,6 @@ compute_failures(Mappings, TargetN, RClaim) -> Failures. - location_multistage_t1_test_() -> %% This is a tricky corner case where we would fail to meet TargetN for %% locations if joining all 9 nodes in one claim (as old sequential_claim will @@ -408,9 +404,8 @@ location_multistage_t1_test_() -> location_multistage_claim_tester(64, JoiningNodes, 4, l5n9, loc5, 4), location_multistage_claim_tester(128, JoiningNodes, 4, l5n9, loc5, 4), location_multistage_claim_tester(256, JoiningNodes, 4, l5n9, loc5, 4), - location_multistage_claim_tester(512, JoiningNodes, 4, l5n9, loc5, 4) - %% location_multistage_claim_tester(1024, JoiningNodes, 4, l5n9, loc5, 4) - %% location_multistage_claim_tester(2048, JoiningNodes, 4, l5n9, loc5, 4) + location_multistage_claim_tester(512, JoiningNodes, 4, l5n9, loc5, 4), + location_multistage_claim_tester(1024, JoiningNodes, 4, l5n9, loc5, 4) ]}. @@ -496,12 +491,13 @@ location_typical_expansion_test_() -> {inparallel, [location_typical_expansion_tester(64), location_typical_expansion_tester(128), - location_typical_expansion_tester(256) - %% location_typical_expansion_tester(512) + location_typical_expansion_tester(256), + location_typical_expansion_tester(512), + location_typical_expansion_tester(1024) ]}}. location_typical_expansion_tester(RingSize) -> - {timeout, 120, + {timeout, 600, {"Ringsize "++integer_to_list(RingSize), fun() -> N1 = l1n1, From 0da5c68eb2367f483146ec1f7b621a376044e721 Mon Sep 17 00:00:00 2001 From: Thomas Arts Date: Wed, 17 May 2023 13:51:36 +0200 Subject: [PATCH 16/30] Adding an extra test (#1004) * Add an extra test to show owners may stay the same if only location changes This is not always the case, but holds when there is a solution in the first place * Fix type error that dialyzer could not find * Introduce necessary conditions to fallback to version 2 * update tests * Check whether it is worth to use brute force * make historic values the norm * Introduce nvals map type * Take nr nodes into account when checking for brute force cond. * Property to evaluate skipping brute force strategy * QuickCheck property starts with choosing ring size. 
* Remove fallback for necessary conditions

* Filter tests to get away with flakiness

* In order to re-run the test suite, remove strict precondition

* Check in test suite

* Replace claim_suite.suite by larger claim.suite

* Sometimes it is worth running brute_force down to a zero node violation

* Better documentation of the binring algorithm

* Run property with a sufficient condition

---
 claim.suite                         | Bin 0 -> 347393 bytes
 src/riak_core_claim_binring_alg.erl | 248 +++++++++++++++++++---------
 src/riak_core_claim_swapping.erl    | 109 +++++++-----
 src/riak_core_claimant.erl          |   6 +-
 test/riak_core_claim_eqc.erl        |  59 ++++---
 5 files changed, 276 insertions(+), 146 deletions(-)
 create mode 100644 claim.suite

diff --git a/claim.suite b/claim.suite
new file mode 100644
index 0000000000000000000000000000000000000000..c4e685902a07612cec60c4b41341ce4c5767aaa9
GIT binary patch
literal 347393
[347,393 bytes of base85-encoded binary patch data elided]
diff --git a/src/riak_core_claim_binring_alg.erl b/src/riak_core_claim_binring_alg.erl
index d73515cb1..b09ddebe7 100644
--- a/src/riak_core_claim_binring_alg.erl
+++ b/src/riak_core_claim_binring_alg.erl
@@ -80,10 +80,13 @@
 %%
 %% Step 3 gives a best effort solution, but given the enormous amount of
 %% possible operations, it can take a while to return. But it always terminates.
+%% In order to make it terminate, we added heuristics in "worth_brute_force".
+%% If we consider the number of violations too large for success, we either
+%% skip Step 3 entirely or run it only until the node Nval is solved.
 %%
 %% When we update a ring, then we want as few transfers as possible,
 %% so first an effort is performed to just swap nodes.
If that would not -%% work to get a solution, a brute-force attempt is taken to get best-effort +%% work to get a solution, a solve attempt is taken to get best-effort %% again. @@ -91,7 +94,7 @@ -export([solve/3, update/3, - zero_violations/2, + node_loc_violations/2, moves/2, to_list/1, from_list/1]). @@ -105,7 +108,9 @@ -ifdef(DEBUG). -compile([export_all, nowarn_export_all]). --define(DEBUG_FUNS). +-ifndef(DEBUG_FUNS). +-define(DEBUG_FUNS, true). +-endif. -define(PROFILE, true). -include_lib("eqc/include/eqc_profile.hrl"). -define(debug(Fmt, Args), io:format(Fmt, Args)). @@ -120,10 +125,11 @@ %% byte node index. -type ring() :: binary(). -type ring_size() :: non_neg_integer(). --type nval() :: non_neg_integer(). +-type nval() :: pos_integer(). -type node_nval() :: nval(). -type loc_nval() :: nval(). --type nvals() :: nval() | {node_nval(), loc_nval()}. +-type nvals() :: {node_nval(), loc_nval()}. +-type nvalsmap() :: #{node => node_nval(), location => loc_nval()}. -type config() :: [non_neg_integer()]. %% List of node counts per location @@ -197,7 +203,10 @@ moves(Ring1, Ring2) -> -type violations() :: {non_neg_integer(), non_neg_integer()}. -define(zero_v, {0, 0}). --define(is_zero_v(V), element(1, V) == 0 andalso element(2, V) == 0). +-define(is_zero_v(V), (element(1, V) == 0 andalso element(2, V) == 0)). + +to_nvals(#{location := LNVal, node := NVal}) -> + {NVal, LNVal}. zip_v(F, {A1, B1}, {A2, B2}) -> {F(A1, A2), F(B1, B2)}; zip_v(F, {A1, B1, C1}, {A2, B2, C2}) -> {F(A1, A2), F(B1, B2), F(C1, C2)}; @@ -212,16 +221,23 @@ sub_v(V1, V2) -> zip_v(fun erlang:'-'/2, V1, V2). -spec sum_v([violations()]) -> violations(). sum_v(Vs) -> lists:foldl(fun add_v/2, ?zero_v, Vs). --spec zero_violations(ring(), nvals()) -> boolean(). -zero_violations(Ring, NVals) -> - V = violations(Ring, NVals), - ?is_zero_v(V). +-spec node_v(violations()) -> non_neg_integer(). +node_v(V) -> + element(2, V). + +-spec loc_v(violations()) -> non_neg_integer(). +loc_v(V) -> + element(1, V). + +-spec node_loc_violations(ring(), nvalsmap()) -> {non_neg_integer(), non_neg_integer()}. +node_loc_violations(Ring, NValsMap) -> + V = violations(Ring, to_nvals(NValsMap)), + {node_v(V), loc_v(V)}. %% What's the maximum distance from an updated vnode where a violation change %% can happen. -spec max_violation_dist(nvals()) -> non_neg_integer(). -max_violation_dist({N, L}) -> max(N, L); -max_violation_dist(N) -> N. +max_violation_dist({N, L}) -> max(N, L). -spec violations(ring(), nvals()) -> violations(). 
violations(Ring, NVals) -> @@ -237,13 +253,9 @@ violations(Ring, NVals, VNodes) when is_list(VNodes) -> sum_v([ violations(Ring, NVals, I) || I <- VNodes ]); violations(Ring, NVals, VNode) -> ?BENCHMARK(violations, begin - {NVal, LVal} = case NVals of - {N, L} -> {N, L}; - N -> {N, N} - end, + {NVal, LVal} = NVals, Locs = fun(Ns) -> [ L || {L, _} <- Ns ] end, - NV = window_violations( window(Ring, VNode, NVal), NVal), - + NV = window_violations(window(Ring, VNode, NVal), NVal), LocV = fun(D) -> window_violations(Locs(window(Ring, VNode, LVal + D)), LVal + D) end, LV = LocV(0), {LV, NV} @@ -315,9 +327,12 @@ brute_force(Ring, NVals, Options, V) -> brute_force(Ring, NVals, Options, V, OrigSwaps) -> TryHard = proplists:get_bool(try_hard, Options), + AlwaysBruteForce = proplists:get_bool(brute_force, Options), + StopNodeOnly = proplists:get_bool(node_only, Options) andalso node_v(V) == 0, case V of _ when not TryHard, ?is_zero_v(V) -> Ring; ?zero_v -> Ring; + _ when StopNodeOnly, not AlwaysBruteForce -> Ring; _ -> N = ring_size(Ring), Swaps = @@ -411,10 +426,33 @@ swap(Ring, I, J) -> Y = get_node(Ring, J), set_node(set_node(Ring, I, Y), J, X). +worth_brute_force(RingSize, V) -> + NodeOnly = node_v(V) < RingSize div 2, + Full = loc_v(V) + node_v(V) < RingSize, + if Full -> brute_force; + NodeOnly -> node_only; + true -> no_brute_force + end. + +maybe_brute_force(Ring, NVals) -> + case worth_brute_force(ring_size(Ring), violations(Ring, NVals)) of + brute_force -> brute_force(Ring, NVals); + node_only -> brute_force(Ring, NVals, [node_only]); + no_brute_force -> Ring + end. + + %% -- The solver ---------------------------------------------------------- --spec solve(ring_size(), config(), nvals()) -> ring(). -solve(RingSize, Config, NVals) -> +-spec solve(ring_size(), config(), nvalsmap()) -> ring(). +solve(RingSize, Config, NValsMap) -> + solve(RingSize, Config, NValsMap, []). + +-spec solve(ring_size(), config(), nvalsmap(), proplists:proplist()) -> ring(). +solve(RingSize, [1], _NValsMap, _) -> + from_list(lists:duplicate(RingSize, {1,1})); +solve(RingSize, Config, NValsMap, Options) -> + NVals = to_nvals(NValsMap), NumNodes = lists:sum(Config), Rounds = RingSize div NumNodes, AllNodes = nodes_in_config(Config), @@ -426,18 +464,31 @@ solve(RingSize, Config, NVals) -> BigRingD = solve_node_deletions(Cycle(Rounds + 1), NVals, ToRemove), VD = violations(BigRingD, NVals), ?debug("Delete\n~s\n", [show(BigRingD, NVals)]), + NoBruteForce = proplists:get_bool(no_brute_force, Options), + AlwaysBruteForce = proplists:get_bool(brute_force, Options), case VD of - ?zero_v -> brute_force(BigRingD, NVals); + ?zero_v -> BigRingD; + _ when NumNodes > RingSize -> + %% Should not ask for this case + if NoBruteForce -> BigRingD; + AlwaysBruteForce -> brute_force(BigRingD, NVals); + true -> maybe_brute_force(BigRingD, NVals) + end; _ -> BigRingI = solve_node_insertions(Cycle(Rounds), NVals, Extras), ?debug("Insert\n~s\n", [show(BigRingI, NVals)]), VI = violations(BigRingI, NVals), - if VI < VD -> + BFRing = + if VI < VD -> ?debug("Chose insert\n", []), - brute_force(BigRingI, NVals); + BigRingI; true -> ?debug("Chose delete\n", []), - brute_force(BigRingD, NVals) + BigRingD + end, + if NoBruteForce -> BFRing; + AlwaysBruteForce -> brute_force(BigRingD, NVals); + true -> maybe_brute_force(BFRing, NVals) end end. @@ -507,8 +558,11 @@ nodes_in_ring(RingSize, Config) -> X = RingSize div lists:sum(Config), lists:append(lists:duplicate(X, nodes_in_config(Config))) ++ extra_nodes(RingSize, Config). 
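%% Worked example (illustrative) for worth_brute_force/2 above, with
%% violations written as {LocationV, NodeV} and a ring size of 64
%% (so RingSize div 2 = 32):
%%   worth_brute_force(64, {10, 20}) -> brute_force     (10 + 20 < 64)
%%   worth_brute_force(64, {50, 20}) -> node_only       (20 < 32, 50 + 20 >= 64)
%%   worth_brute_force(64, {50, 40}) -> no_brute_force  (40 >= 32, 50 + 40 >= 64)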
--spec update(ring(), config(), nvals()) -> ring(). -update(OldRing, Config, NVals) -> +-spec update(ring(), config(), nvalsmap()) -> ring(). +update(OldRing, [1], NValsMap) -> + solve(ring_size(OldRing), [1], NValsMap); +update(OldRing, Config, NValsMap) -> + NVals = to_nvals(NValsMap), %% Diff old and new config RingSize = ring_size(OldRing), OldNodes = to_list(OldRing), @@ -517,8 +571,7 @@ update(OldRing, Config, NVals) -> ToRemove = OldNodes -- NewNodes, %% Swap in new nodes for old nodes (in a moderately clever way) NewRing = swap_in_nodes(OldRing, ToAdd, ToRemove, NVals), - %% Brute force fix any remaining conflicts - brute_force(NewRing, NVals, []). + maybe_brute_force(NewRing, NVals). swap_in_nodes(Ring, [], [], _NVals) -> Ring; swap_in_nodes(Ring, [New | ToAdd], ToRemove, NVals) -> @@ -559,12 +612,14 @@ show(Ring, NVals) -> [ [io_lib:format(Color(V, "~c~p "), [L + $A - 1, I]) || {{L, I}, V} <- lists:zip(to_list(Ring), Vs)] , pp_violations(TotalV) ])). -show_solve(RingSize, Config, NVals) -> - io:format("~s\n", [show(solve(RingSize, Config, NVals), NVals)]). +show_solve(RingSize, Config, NValsMap) -> + NVals = to_nvals(NValsMap), + io:format("~s\n", [show(solve(RingSize, Config, NValsMap), NVals)]). -show_update(RingSize, OldConfig, NewConfig, NVals) -> - OldRing = solve(RingSize, OldConfig, NVals), - NewRing = update(OldRing, NewConfig, NVals), +show_update(RingSize, OldConfig, NewConfig, NValsMap) -> + NVals = to_nvals(NValsMap), + OldRing = solve(RingSize, OldConfig, NValsMap), + NewRing = update(OldRing, NewConfig, NValsMap), io:format("Old\n~s\nNew\n~s\nDiff=~p\n", [show(OldRing, NVals), show(NewRing, NVals), moves(OldRing, NewRing)]). -endif. @@ -572,50 +627,37 @@ show_update(RingSize, OldConfig, NewConfig, NVals) -> -ifdef(TEST). -generate_swaps_test() -> - time_generating_swaps(32), - time_generating_swaps(128), - time_generating_swaps(1024). - -time_generating_swaps(N) -> - SW = os:timestamp(), - Swaps = generate_swaps(N, []), - io:format( - user, - "Generate swaps for RS ~w in ~w ms length ~w~n", - [N, timer:now_diff(os:timestamp(), SW) div 1000, length(Swaps)]). - %% -- Unit tests for experimentation --------------------------------------- %% These tests take a bit of time when running. %% Not intended to be included in automatic testing. 
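%% Illustrative only (requires a build with DEBUG_FUNS defined): the
%% helpers above make a transition easy to eyeball from the shell, e.g.
%% adding a fourth location of three nodes to a [4, 3, 3] cluster on a
%% 64 partition ring:
%%   riak_core_claim_binring_alg:show_update(
%%       64, [4, 3, 3], [4, 3, 3, 3], #{node => 4, location => 3}).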
known_hard_tests() -> - Tests = [ {16, [4, 3, 3, 2], 3} - , {32, [3, 2, 1, 4, 3], 3} - , {32, [5, 6, 5, 1, 1], 3} - , {128, [1, 1, 1, 1, 1, 1], 5} - , {16, [4, 4, 4, 3], 4} - , {16, [4, 4, 3, 3], 4} - , {16, [4, 3, 3, 3], 4} - , {32, [4, 3, 3, 3], 4} - , {48, [4, 3, 3, 3], 4} - , {32, [2, 2, 2, 2, 2], 4} - , {16, [2, 2, 1, 2, 2], 4} - , {16, [2, 2, 4, 2], 4} - , {16, [3, 2, 2, 2], 4} - , {32, [3, 2, 2, 2], 4} - , {32, [3, 3, 3, 1, 1], 4} - , {16, [1, 3, 2, 1, 1, 1], 4} - , {64, [2, 2, 1, 2, 2, 2], 5} - , {256, [6, 5, 2], 2} - , {64, [3, 3, 3, 2, 1], 4} - , {32, [3, 3, 3, 3, 1], 4} - , {512, [4, 4, 4, 4, 1], 4} + Tests = [ {16, [4, 3, 3, 2], 3, ?zero_v} + , {32, [3, 2, 1, 4, 3], 3, ?zero_v} + , {32, [5, 6, 5, 1, 1], 3, ?zero_v} + , {128, [1, 1, 1, 1, 1, 1], 5, ?zero_v} + , {16, [4, 4, 4, 3], 4, ?zero_v} + , {16, [4, 4, 3, 3], 4, ?zero_v} + , {16, [4, 3, 3, 3], 4, ?zero_v} + , {32, [4, 3, 3, 3], 4, ?zero_v} + , {48, [4, 3, 3, 3], 4, ?zero_v} + , {32, [2, 2, 2, 2, 2], 4, {2,0}} + , {16, [2, 2, 1, 2, 2], 4, ?zero_v} + , {16, [2, 2, 4, 2], 4, ?zero_v} + , {16, [3, 2, 2, 2], 4, ?zero_v} + , {32, [3, 2, 2, 2], 4, {8, 0}} + , {32, [3, 3, 3, 1, 1], 4, {16,0}} + , {16, [1, 3, 2, 1, 1, 1], 4, {4, 0}} + , {64, [2, 2, 1, 2, 2, 2], 5, ?zero_v} + , {256, [6, 5, 2], 2, ?zero_v} + , {64, [3, 3, 3, 2, 1], 4, {4,0}} + , {32, [3, 3, 3, 3, 1], 4, {4,0}} + , {512, [4, 4, 4, 4, 1], 4, {4,0}} ], [ {Size, Config, NVal, '->', V} - || {Size, Config, NVal} <- Tests - , V <- [violations(solve(Size, Config, NVal), NVal)] - , not ?is_zero_v(V) + || {Size, Config, NVal, Expect} <- Tests + , V <- [violations(solve(Size, Config, #{location => NVal, node => NVal}), {NVal, NVal})] + , V /= Expect ]. typical_scenarios_tests() -> @@ -637,10 +679,10 @@ typical_scenarios_tests() -> fun(_Config, Err={error, _}) -> Err; (Config, {undefined, Diffs}) -> - {solve(Size, Config, NVal), Diffs}; + {solve(Size, Config, #{location => NVal, node => NVal}), Diffs}; (Config, {OldRing, Diffs}) -> - NewRing = update(OldRing, Config, NVal), - V = violations(NewRing, NVal), + NewRing = update(OldRing, Config, #{location => NVal, node => NVal}), + V = violations(NewRing, {NVal, NVal}), Diff = moves(OldRing, NewRing), if ?is_zero_v(V) -> {NewRing, Diffs ++ [Diff]}; true -> {error, {Size, OldRing, NewRing, Config, V}} @@ -648,11 +690,30 @@ typical_scenarios_tests() -> end, {undefined, [0]}, Tests) || Size <- [64, 128, 256, 512, 1024] ], + HistoricDiffs = + [[0,56,8,8,6,8,26,5,4,4], + [0,112,15,12,12,14,9,12,9,8], + [0,224,29,31,24,21,21,21,23,16], + [0,448,57,59,47,48,39,45,36,32], + [0,896,114,119,94,85,78,87,79,64]], case [ Err || {error, Err} <- Results ] of - [] -> {ok, [ Diff || {_Ring, Diff} <- Results ]}; + [] -> + true = + lists:all(fun({L1, L2}) -> + lists:sum(L1) =< lists:sum(L2) + end, lists:zip([ Diff || {_Ring, Diff} <- Results ], HistoricDiffs)); Errs -> {error, Errs} end. +wcets() -> + [ io:format("Size ~4b Time ~.1f sec\n", [Size, wcet(Size, 4) / 1000000]) + || Size <- [16, 32, 64, 128, 256, 512, 1024, 2048] ]. + +wcet(RingSize, NVal) -> + NValMap = #{location => NVal, node => NVal}, + Ring = solve(RingSize, [1], NValMap), + {T, _} = timer:tc(fun() -> brute_force(Ring, {NVal, NVal}) end), + T. -ifdef(EQC). @@ -664,7 +725,7 @@ ring() -> non_empty(list(pnode())). nvals() -> ?LET(NVal, choose(1, 5), ?LET(LVal, choose(1, NVal), - if NVal == LVal -> NVal; true -> {NVal, LVal} end)). + {NVal, LVal})). 
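%% Illustrative only: after this change nvals() always generates a
%% {NVal, LVal} pair with 1 =< LVal =< NVal =< 5, matching the nvals()
%% type; for example {4, 2} demands distinct nodes in every window of
%% four consecutive vnodes and distinct locations in every window of two.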
op(N) -> Ix = choose(0, N - 1), @@ -717,15 +778,46 @@ prop_swap_violations() -> prop_no_locations() -> ?FORALL({Size, Nodes, NVal}, {elements([16, 32, 64, 128, 256, 512]), choose(1, 64), choose(1,5)}, begin - {OneT, OneRing} = timer:tc(?MODULE, solve, [Size, [Nodes], {NVal, 1}]), + {OneT, OneRing} = timer:tc(fun() -> solve(Size, [Nodes], #{node => NVal, location => 1}) end), {_, OneViolations} = violations(OneRing, {NVal, 1}), - {SepT, SepRing} = timer:tc(?MODULE, solve, [Size, lists:duplicate(Nodes, 1), NVal]), - {_, SepViolations} = violations(SepRing, NVal), + {SepT, SepRing} = timer:tc(fun() -> solve(Size, lists:duplicate(Nodes, 1), #{node => NVal, location => NVal}) end), + {_, SepViolations} = violations(SepRing, {NVal, NVal}), measure(one_location, OneT, measure(sep_location, SepT, equals(OneViolations, SepViolations))) end). +config_gen() -> + ?LET(N, choose(1,7), vector(N, choose(1,8))). + +prop_brute_force_optimize() -> + in_parallel( + ?FORALL({Size, Config, NValsMap}, {elements([16, 32, 64, 128, 256, 512, 1024]), + config_gen(), + ?LET(N, choose(2,5), #{node => N, location => default(N, choose(2, N))})}, + ?IMPLIES(length(Config) >= maps:get(location, NValsMap), + begin + NVals = to_nvals(NValsMap), + {T1, Ring1} = timer:tc(fun() -> solve(Size, Config, NValsMap, [no_brute_force]) end), + [{_, RV1}, {_, RV2}, {_, RV3}] = Res = + case violations(Ring1, NVals) of + ?zero_v -> [{T1, ?zero_v}, {0, ?zero_v}, {0, ?zero_v}]; + V1 -> + {T2, Ring2} = timer:tc(fun() -> brute_force(Ring1, NVals, [node_only]) end), + V2 = violations(Ring2, NVals), + {T3, Ring3} = timer:tc(fun() -> brute_force(Ring2, NVals) end), + V3 = violations(Ring3, NVals), + [{T1 / 1000000, V1}, {T2 / 1000000, V2}, {T3 / 1000000, V3}] + end, + Improved = length(lists:usort([ RV1, RV2, RV3 ])) > 1, + WorthBF = worth_brute_force(Size, RV1), + FailNodeOnly = node_v(RV2) == 0 andalso WorthBF == no_brute_force, + FailBruteForce = ?is_zero_v(RV3) andalso WorthBF /= brute_force, + ?WHENFAIL(eqc:format("Worth brute force ~p for ~p\n", [WorthBF, Res]), + aggregate([{Size, Config, NValsMap, Res, WorthBF} || Improved andalso WorthBF /= brute_force], + conjunction([{node_only, not FailNodeOnly}, + {brute_force, not FailBruteForce}]))) + end))). -endif. -endif. diff --git a/src/riak_core_claim_swapping.erl b/src/riak_core_claim_swapping.erl index 6856ad64d..0fd018d44 100644 --- a/src/riak_core_claim_swapping.erl +++ b/src/riak_core_claim_swapping.erl @@ -59,8 +59,8 @@ -spec memoize( binring_solve|binring_update, {binary()|pos_integer(), - list(pos_integer()), - {pos_integer(), pos_integer()}}, + list(non_neg_integer()), + #{location := pos_integer(), node := pos_integer()}}, fun(() -> binary())) -> binary(). memoize(Registry, Key, Fun) -> V4Solutions = @@ -107,7 +107,7 @@ claim(Ring, Params0) -> true -> 1 end, RingSize = riak_core_ring:num_partitions(Ring), - NVals = {TargetN, TargetLN}, + NVals = #{node => TargetN, location => TargetLN}, %% Now we need to map the locations and nodes to a configuration that %% basically is a list of locations with the number of nodes in it. 
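%% For example (a sketch with hypothetical node and location names): the
%% solver only sees nodes-per-location counts, and the location relation
%% is kept alongside so a solution can be mapped back to real nodes:
config_example() ->
    LocationNodes = [{loc1, n1}, {loc1, n2}, {loc2, n3}, {loc2, n4}],
    Config = [2, 2],    %% two locations with two nodes each
    {LocationNodes, Config}.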
@@ -119,40 +119,23 @@ claim(Ring, Params0) -> {BinRing0, OldLocRel} = to_binring(Ring), {Config, LocRel} = to_config(Ring, OldLocRel), - SWup = os:timestamp(), - BinRing1 = - memoize( - binring_update, - {BinRing0, Config, NVals}, - fun() -> - riak_core_claim_binring_alg:update(BinRing0, Config, NVals) - end), - lager:info( - "~w Swapping algorithm update in ~w ms Config ~w NVals ~w h(BinRing) ~w", - [self(), timer:now_diff(os:timestamp(), SWup) div 1000, - Config, NVals, erlang:phash2(BinRing1)] - ), - - SWsv = os:timestamp(), - UpdateSolved = - riak_core_claim_binring_alg:zero_violations(BinRing1, NVals), BinRing = - case UpdateSolved of - false -> - memoize( - binring_solve, - {RingSize, Config, NVals}, - fun() -> - riak_core_claim_binring_alg:solve(RingSize, Config, NVals) - end); + case length(OldLocRel) == 1 of true -> - BinRing1 + %% Only one node in old ring, don't even try update + solve_memoized(RingSize, Config, NVals); + false -> + BinRingU = update_memoized(BinRing0, Config, NVals), + case riak_core_claim_binring_alg:node_loc_violations(BinRingU, NVals) of + {0, 0} -> BinRingU; + UV -> + BinRingS = solve_memoized(RingSize, Config, NVals), + SV = riak_core_claim_binring_alg:node_loc_violations(BinRingS, NVals), + if SV < UV -> BinRingS; + true -> BinRingU + end + end end, - lager:info( - "~w Swapping algorithm solve in ~w ms as solve_required=~w", - [self(), timer:now_diff(os:timestamp(), SWsv) div 1000, - not UpdateSolved] - ), Inc = chash:ring_increment(RingSize), SolvedNodes = @@ -171,6 +154,41 @@ claim(Ring, Params0) -> NewRing. +update_memoized(BinRing, Config, NVals) -> + TS = os:timestamp(), + BinRingU = + memoize( + binring_update, + {BinRing, Config, NVals}, + fun() -> + riak_core_claim_binring_alg:update(BinRing, Config, NVals) + end), + lager:info( + "~w Swapping algorithm update in ~w ms Config ~w NVals ~w h(BinRing) ~w", + [self(), timer:now_diff(os:timestamp(), TS) div 1000, + Config, NVals, erlang:phash2(BinRingU)] + ), + BinRingU. + +solve_memoized(RingSize, Config, NVals) -> + TS = os:timestamp(), + BinRingS = + memoize( + binring_solve, + {RingSize, Config, NVals}, + fun() -> + riak_core_claim_binring_alg:solve(RingSize, Config, NVals) + end), + lager:info( + "~w Swapping algorithm solve in ~w ms", + [self(), timer:now_diff(os:timestamp(), TS) div 1000] + ), + BinRingS. + +claiming_nodes(Ring) -> + Claiming = riak_core_ring:claiming_members(Ring), + LocationDict = riak_core_ring:get_nodes_locations(Ring), + [ {riak_core_location:get_node_location(N, LocationDict), N} || N <- Claiming ]. to_binring(Ring) -> LocationDict = riak_core_ring:get_nodes_locations(Ring), @@ -199,10 +217,7 @@ to_binring(LocationRing, LeavingMembers) -> {riak_core_claim_binring_alg:from_list(Nodes), LocationRel}. to_config(Ring, OldLocRel) -> - Claiming = riak_core_ring:claiming_members(Ring), - LocationDict = riak_core_ring:get_nodes_locations(Ring), - LocationNodes = [ {riak_core_location:get_node_location(N, LocationDict), N} || N <- Claiming ], - to_config2(LocationNodes, OldLocRel). + to_config2(claiming_nodes(Ring), OldLocRel). to_config2(LocationNodes, FixedLocRel) -> OldLocIdxs = lists:usort([ {LI, L} || {{LI, _}, {L,_}} <- FixedLocRel ]), @@ -275,6 +290,20 @@ simple_cluster_t1_test() -> claim(R1, Props), ?assert(true, riak_core_membership_claim:meets_target_n(RClaim, TargetN)). 
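%% The SV < UV test in claim/2 above relies on Erlang's element-wise
%% tuple ordering over the pair of violation counts returned by
%% node_loc_violations/2, so the first count dominates and the second
%% breaks ties; a sketch (illustrative function name):
violation_order_example() ->
    true = ({0, 5} < {1, 0}),
    true = ({1, 2} < {1, 3}),
    ok.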
+locs_and_no_locs_test() ->
+    RingSize = 32,
+    TargetN = 2,
+    NodeList = [{n1, loc1}, {n2, loc2}, n3, n4],
+    R0 = riak_core_ring:set_node_location(n1, loc1, riak_core_ring:fresh(RingSize, n1)),
+    Params = [{target_n_val, TargetN}],
+    RClaim = add_nodes_to_ring(R0, n1, NodeList -- [{n1, loc1}], Params),
+    ?assert(riak_core_membership_claim:meets_target_n(RClaim, TargetN)),
+    RMove = riak_core_ring:set_node_location(n1, loc3, RClaim),
+    RClaim2 = claim(RMove, Params),
+    ?assertEqual(riak_core_ring:all_owners(RClaim2),
+                 riak_core_ring:all_owners(RClaim)).
+
+
 location_t1_test_() ->
     JoiningNodes =
@@ -352,7 +381,9 @@ add_nodes_to_ring(Ring, Claimant, NodeLocList, Params) ->
     NewRing = lists:foldl(
         fun({N, L}, AccR) ->
             AccR0 = riak_core_ring:add_member(Claimant, AccR, N),
-            riak_core_ring:set_node_location(N, L, AccR0)
+            riak_core_ring:set_node_location(N, L, AccR0);
+            (N, AccR) ->
+                riak_core_ring:add_member(Claimant, AccR, N)
         end,
         Ring,
         NodeLocList),
diff --git a/src/riak_core_claimant.erl b/src/riak_core_claimant.erl
index a9debc7fe..08274495c 100644
--- a/src/riak_core_claimant.erl
+++ b/src/riak_core_claimant.erl
@@ -66,8 +66,8 @@
 -type v4_solution() ::
     {{binring_solve|binring_update,
         {binary()|pos_integer(),
-            list(pos_integer()),
-            {pos_integer(), pos_integer()}}},
+            list(non_neg_integer()),
+            #{location := pos_integer(), node := pos_integer()}}},
     binary()}.
 
 -record(state, {
@@ -125,7 +125,7 @@ plan() ->
 %% A commit is only allowed to succeed if the ring is ready and if the
 %% current set of changes matches those computed by the most recent
 %% call to plan/0.
--spec commit() -> ok | {error, term()}.
+-spec commit() -> ok | {error, term()} | error.
 commit() ->
     gen_server:call(claimant(), commit, infinity).
 
diff --git a/test/riak_core_claim_eqc.erl b/test/riak_core_claim_eqc.erl
index 389106dc1..b33220a6b 100644
--- a/test/riak_core_claim_eqc.erl
+++ b/test/riak_core_claim_eqc.erl
@@ -35,15 +35,19 @@
        committed_nodes = [],
        staged_nodes = [] :: [Name :: atom()],   %% nodes added/left before claim,
        plan = [],                               %% staged nodes after claim
+       sufficient = false,
        with_location = false
       }).
 
 %% -- State and state functions ----------------------------------------------
 initial_state() ->
-    initial_state(3).
+    initial_state(#{}).
 
-initial_state(Nval) ->
-    #state{nval = Nval}.
+initial_state(Map) ->
+    #state{nval = maps:get(nval, Map, 4),
+           ring_size = maps:get(ring_size, Map, 32),
+           sufficient = maps:get(sufficient, Map, false),
+           with_location = maps:get(with_location, Map, true)}.
 
 %% -- Generators -------------------------------------------------------------
 
@@ -92,7 +96,7 @@ add_claimant_pre(S) ->
     S#state.claimant == undefined.
 
 add_claimant_args(S) ->
-    [hd(S#state.placements), S#state.with_location, ringsize()].
+    [hd(S#state.placements), S#state.with_location, S#state.ring_size].
 
 add_claimant_pre(S, [LocNode, _, RingSize]) ->
     LocNodes = S#state.placements,
@@ -161,6 +165,7 @@ claim_pre(S) ->
     andalso S#state.plan == [] andalso S#state.staged_nodes /= [].  %% make sure there is something sensible to do
 
 claim_args(S) ->
+    %% v2 does not take leaving nodes into account, but the model does
     [elements([v4]), S#state.nval].
 
 claim(Ring, default, Nval) ->
@@ -176,8 +181,9 @@ claim(Ring, v4, Nval) ->
         {riak_core_membership_claim, wants_claim_v2},
         {riak_core_claim_swapping, choose_claim_v4, [{target_n_val, Nval}]}]).
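%% A usage sketch for the option map taken by initial_state/1 above;
%% omitted keys fall back to the defaults shown there (nval 4,
%% ring_size 32, sufficient false, with_location true); the function
%% name is illustrative:
initial_state_example() ->
    S = initial_state(#{nval => 3, ring_size => 64}),
    %% sufficient and with_location keep their defaults
    S.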
-claim_pre(S, [v4, _Nval]) -> - not known_hard(S) andalso (length(S#state.nodes) < S#state.ring_size div 2); +claim_pre(#state{sufficient = true} = S, [v4, _Nval]) -> + %% Sufficient conditions to actually succeed + sufficient_conditions(S); claim_pre(_, [_, _]) -> true. @@ -287,6 +293,12 @@ necessary_conditions(S) -> || S#state.nval == length(Locations) ] ). +sufficient_conditions(S) -> + Locations = to_config(S), + length(Locations) >= S#state.nval + 2 + andalso length(S#state.nodes) < S#state.ring_size div 2 + andalso lists:min(Locations) >= lists:max(Locations) - 2. + to_config(S) -> LocNodes = lists:foldl(fun(N, Acc) -> @@ -367,21 +379,33 @@ weight(_S, _Cmd) -> 1. %% -- Property --------------------------------------------------------------- prop_claim() -> + prop_claim([relaxed]). + +prop_claim(Options) -> + Relaxed = proplists:get_bool(relaxed, Options), + %% If relaxed, we restrict configurations to those that we can easily + %% determine have a solution (sufficient_condition). case ets:whereis(timing) of undefined -> ets:new(timing, [public, named_table, bag]); _ -> ok end, - ?FORALL({Nval, WithLocation}, {choose(2, 5), bool()}, - ?FORALL(Cmds, commands(?MODULE, with_location(initial_state(Nval), WithLocation)), + ?FORALL({Nval, RingSize, WithLocation}, {choose(2, 5), ringsize(), bool()}, + ?FORALL(Cmds, commands(?MODULE, initial_state(#{nval => Nval, + ring_size => RingSize, + sufficient => Relaxed, + with_location => WithLocation})), begin put(ring_nr, 0), {H, S, Res} = run_commands(Cmds), + Config = lists:sort(to_config(S)), measure(length, commands_length(Cmds), + features([{RingSize, Config, Nval} || WithLocation andalso + S#state.plan /= [] andalso Res == ok], aggregate_feats([claimed_nodes, ring_size, with_location, nr_nodes, moving, algorithm], call_features(H), check_command_names(Cmds, pretty_commands(?MODULE, Cmds, {H, S, Res}, - Res == ok)))) + Res == ok))))) end)). aggregate_feats([], _, Prop) -> Prop; @@ -391,9 +415,6 @@ aggregate_feats([Op | Ops], Features, Prop) -> aggregate_feats(Ops, Features, Prop)). -with_location(S, Bool) -> - S#state{with_location = Bool}. - location(S, N) when is_record(S, state) -> location(S#state.placements, N); location(LocNodes, N) -> @@ -481,20 +502,6 @@ as_ring(Kind, Term) when is_tuple(Term) -> as_ring(_, Term) -> lists:flatten(io_lib:format("~p", [Term])). - -known_hard(S) -> - lists:member({S#state.ring_size, lists:sort(to_config(S)), S#state.nval}, - [{16, [1,1,1,2,2,2], 5}, - {16, [1,1,1,1,3,3], 5}, - {16, [1,1,1,3,3], 4}, - {16, [1,1,1,1,2,3], 4}, - {16, [1,1,4,4], 3}, - {16, [1,2,2,2,3], 4}, - {128, [1,1,2,2,2,2], 5} - ]). - - - equal({X, Ls1}, {X, Ls2}) -> equal(Ls1, Ls2); equal([X|Ls1], [X|Ls2]) -> From be0451c61766f76efcdf8be3ae382d306910156e Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 17 May 2023 12:51:56 +0100 Subject: [PATCH 17/30] Revert "Long-running tests" This reverts commit 6527033e246367cb7d97a3059ccbf9caff4a107c. 
--- src/riak_core_claim_swapping.erl | 36 ++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/src/riak_core_claim_swapping.erl b/src/riak_core_claim_swapping.erl index f43f5eb35..6856ad64d 100644 --- a/src/riak_core_claim_swapping.erl +++ b/src/riak_core_claim_swapping.erl @@ -287,11 +287,12 @@ location_t1_test_() -> {"[2, 2, 2, 2, 2] nval 4", {inparallel, [location_claim_tester(n1, loc1, JoiningNodes, 64, 4), - location_claim_tester(n1, loc1, JoiningNodes, 128, 4), - location_claim_tester(n1, loc1, JoiningNodes, 256, 4), + location_claim_tester(n1, loc1, JoiningNodes, 128, 4), + location_claim_tester(n1, loc1, JoiningNodes, 256, 4) %% Don't test large rings in automated testing - location_claim_tester(n1, loc1, JoiningNodes, 512, 4), - location_claim_tester(n1, loc1, JoiningNodes, 1024, 4) + %% location_claim_tester(n1, loc1, JoiningNodes, 512, 4), + %% location_claim_tester(n1, loc1, JoiningNodes, 1024, 4) + %% location_claim_tester(n1, loc1, JoiningNodes, 2048, 4) ]}}. location_t2_test_() -> @@ -307,7 +308,8 @@ location_t2_test_() -> location_claim_tester(n1, loc1, JoiningNodes, 128, 4), location_claim_tester(n1, loc1, JoiningNodes, 256, 4), location_claim_tester(n1, loc1, JoiningNodes, 512, 4), - location_claim_tester(n1, loc1, JoiningNodes, 1024, 4) + location_claim_tester(n1, loc1, JoiningNodes, 1024, 4), + location_claim_tester(n1, loc1, JoiningNodes, 2048, 4) ]}}. location_t8_test_() -> @@ -319,15 +321,16 @@ location_t8_test_() -> {"[4, 3, 3, 3] nval 4", {inparallel, [location_claim_tester(l1n1, loc1, JoiningNodes, 64, 3), - location_claim_tester(l1n1, loc1, JoiningNodes, 256, 3) - %% Don't test large rings in automated testing - %% location_claim_tester(n1, loc1, JoiningNodes, 512, 4), - %% location_claim_tester(n1, loc1, JoiningNodes, 1024, 4) + location_claim_tester(l1n1, loc1, JoiningNodes, 256, 3) + %% Don't test large rings in automated testing + %% location_claim_tester(n1, loc1, JoiningNodes, 512, 4), + %% location_claim_tester(n1, loc1, JoiningNodes, 1024, 4), + %% location_claim_tester(n1, loc1, JoiningNodes, 2048, 4) ]}}. location_claim_tester(N1, N1Loc, NodeLocList, RingSize, TargetN) -> {"Ringsize "++integer_to_list(RingSize), - {timeout, 600, + {timeout, 120, fun() -> io:format( "Testing NodeList ~w with RingSize ~w~n", @@ -388,6 +391,7 @@ compute_failures(Mappings, TargetN, RClaim) -> Failures. + location_multistage_t1_test_() -> %% This is a tricky corner case where we would fail to meet TargetN for %% locations if joining all 9 nodes in one claim (as old sequential_claim will @@ -404,8 +408,9 @@ location_multistage_t1_test_() -> location_multistage_claim_tester(64, JoiningNodes, 4, l5n9, loc5, 4), location_multistage_claim_tester(128, JoiningNodes, 4, l5n9, loc5, 4), location_multistage_claim_tester(256, JoiningNodes, 4, l5n9, loc5, 4), - location_multistage_claim_tester(512, JoiningNodes, 4, l5n9, loc5, 4), - location_multistage_claim_tester(1024, JoiningNodes, 4, l5n9, loc5, 4) + location_multistage_claim_tester(512, JoiningNodes, 4, l5n9, loc5, 4) + %% location_multistage_claim_tester(1024, JoiningNodes, 4, l5n9, loc5, 4) + %% location_multistage_claim_tester(2048, JoiningNodes, 4, l5n9, loc5, 4) ]}. 
@@ -491,13 +496,12 @@ location_typical_expansion_test_() -> {inparallel, [location_typical_expansion_tester(64), location_typical_expansion_tester(128), - location_typical_expansion_tester(256), - location_typical_expansion_tester(512), - location_typical_expansion_tester(1024) + location_typical_expansion_tester(256) + %% location_typical_expansion_tester(512) ]}}. location_typical_expansion_tester(RingSize) -> - {timeout, 600, + {timeout, 120, {"Ringsize "++integer_to_list(RingSize), fun() -> N1 = l1n1, From c43d0a660f015273a2c6c71ac4e7a3193059ec03 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 17 May 2023 14:28:31 +0100 Subject: [PATCH 18/30] Test adjustments --- src/riak_core_claim_swapping.erl | 11 ++++++----- src/riak_core_vnode_proxy.erl | 8 +++++++- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/riak_core_claim_swapping.erl b/src/riak_core_claim_swapping.erl index 0fd018d44..4db03b48e 100644 --- a/src/riak_core_claim_swapping.erl +++ b/src/riak_core_claim_swapping.erl @@ -439,9 +439,9 @@ location_multistage_t1_test_() -> location_multistage_claim_tester(64, JoiningNodes, 4, l5n9, loc5, 4), location_multistage_claim_tester(128, JoiningNodes, 4, l5n9, loc5, 4), location_multistage_claim_tester(256, JoiningNodes, 4, l5n9, loc5, 4), - location_multistage_claim_tester(512, JoiningNodes, 4, l5n9, loc5, 4) - %% location_multistage_claim_tester(1024, JoiningNodes, 4, l5n9, loc5, 4) - %% location_multistage_claim_tester(2048, JoiningNodes, 4, l5n9, loc5, 4) + location_multistage_claim_tester(512, JoiningNodes, 4, l5n9, loc5, 4), + location_multistage_claim_tester(1024, JoiningNodes, 4, l5n9, loc5, 4), + location_multistage_claim_tester(2048, JoiningNodes, 4, l5n9, loc5, 4) ]}. @@ -527,8 +527,9 @@ location_typical_expansion_test_() -> {inparallel, [location_typical_expansion_tester(64), location_typical_expansion_tester(128), - location_typical_expansion_tester(256) - %% location_typical_expansion_tester(512) + location_typical_expansion_tester(256), + location_typical_expansion_tester(512), + location_typical_expansion_tester(1024) ]}}. location_typical_expansion_tester(RingSize) -> diff --git a/src/riak_core_vnode_proxy.erl b/src/riak_core_vnode_proxy.erl index dfd907a3e..2516aeca5 100644 --- a/src/riak_core_vnode_proxy.erl +++ b/src/riak_core_vnode_proxy.erl @@ -454,7 +454,13 @@ overload_test_() -> erlang:process_info(VnodePid, message_queue_len), %% Threshold + 2 unanswered vnode_proxy_ping (one %% for first ping, second after process_info check) - ?assert(L =< (?DEFAULT_OVERLOAD_THRESHOLD + 2)) + %% +1 for luck required on faster machines? + io:format( + user, + "~nMessage Queue ~w threshold ~w~n", + [L, ?DEFAULT_OVERLOAD_THRESHOLD] + ), + ?assert(L =< (?DEFAULT_OVERLOAD_THRESHOLD + 3)) end } end From b6dcbc25872e4f598418c21083597c330902e0da Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Thu, 18 May 2023 14:41:37 +0100 Subject: [PATCH 19/30] Test adjustments --- src/riak_core_claim_binring_alg.erl | 6 +- src/riak_core_claim_swapping.erl | 213 ++++++++++++++-------------- 2 files changed, 109 insertions(+), 110 deletions(-) diff --git a/src/riak_core_claim_binring_alg.erl b/src/riak_core_claim_binring_alg.erl index b09ddebe7..6174ea4e5 100644 --- a/src/riak_core_claim_binring_alg.erl +++ b/src/riak_core_claim_binring_alg.erl @@ -788,13 +788,13 @@ prop_no_locations() -> end). config_gen() -> - ?LET(N, choose(1,7), vector(N, choose(1,8))). + ?LET(N, choose(1,7), ?LET(M, choose(2, 6), vector(N, choose(M, M + 2)))). 
prop_brute_force_optimize() -> in_parallel( - ?FORALL({Size, Config, NValsMap}, {elements([16, 32, 64, 128, 256, 512, 1024]), + ?FORALL({Size, Config, NValsMap}, {elements([128, 256, 512]), config_gen(), - ?LET(N, choose(2,5), #{node => N, location => default(N, choose(2, N))})}, + ?LET(N, choose(3, 4), #{node => N, location => default(N, choose(2, N))})}, ?IMPLIES(length(Config) >= maps:get(location, NValsMap), begin NVals = to_nvals(NValsMap), diff --git a/src/riak_core_claim_swapping.erl b/src/riak_core_claim_swapping.erl index 4db03b48e..71fb2ee4c 100644 --- a/src/riak_core_claim_swapping.erl +++ b/src/riak_core_claim_swapping.erl @@ -315,13 +315,12 @@ location_t1_test_() -> ], {"[2, 2, 2, 2, 2] nval 4", {inparallel, - [location_claim_tester(n1, loc1, JoiningNodes, 64, 4), - location_claim_tester(n1, loc1, JoiningNodes, 128, 4), - location_claim_tester(n1, loc1, JoiningNodes, 256, 4) - %% Don't test large rings in automated testing - %% location_claim_tester(n1, loc1, JoiningNodes, 512, 4), - %% location_claim_tester(n1, loc1, JoiningNodes, 1024, 4) - %% location_claim_tester(n1, loc1, JoiningNodes, 2048, 4) + [ + location_claim_tester(n1, loc1, JoiningNodes, 64, 4), + location_claim_tester(n1, loc1, JoiningNodes, 128, 4), + location_claim_tester(n1, loc1, JoiningNodes, 256, 4), + location_claim_tester(n1, loc1, JoiningNodes, 512, 4), + location_claim_tester(n1, loc1, JoiningNodes, 1024, 4) ]}}. location_t2_test_() -> @@ -333,12 +332,12 @@ location_t2_test_() -> ], {"[2, 2, 2, 2] nval 4", {inparallel, - [location_claim_tester(n1, loc1, JoiningNodes, 64, 4), - location_claim_tester(n1, loc1, JoiningNodes, 128, 4), - location_claim_tester(n1, loc1, JoiningNodes, 256, 4), - location_claim_tester(n1, loc1, JoiningNodes, 512, 4), - location_claim_tester(n1, loc1, JoiningNodes, 1024, 4), - location_claim_tester(n1, loc1, JoiningNodes, 2048, 4) + [ + location_claim_tester(n1, loc1, JoiningNodes, 64, 4), + location_claim_tester(n1, loc1, JoiningNodes, 128, 4), + location_claim_tester(n1, loc1, JoiningNodes, 256, 4), + location_claim_tester(n1, loc1, JoiningNodes, 512, 4), + location_claim_tester(n1, loc1, JoiningNodes, 1024, 4) ]}}. location_t8_test_() -> @@ -347,14 +346,14 @@ location_t8_test_() -> {l2n1, loc2}, {l2n2, loc2}, {l2n3, loc2}, {l3n1, loc3}, {l3n2, loc3}, {l3n3, loc3}, {l4n1, loc4}, {l4n2, loc4}, {l4n3, loc4}], - {"[4, 3, 3, 3] nval 4", + {"[4, 3, 3, 3] nval 3", {inparallel, - [location_claim_tester(l1n1, loc1, JoiningNodes, 64, 3), - location_claim_tester(l1n1, loc1, JoiningNodes, 256, 3) - %% Don't test large rings in automated testing - %% location_claim_tester(n1, loc1, JoiningNodes, 512, 4), - %% location_claim_tester(n1, loc1, JoiningNodes, 1024, 4), - %% location_claim_tester(n1, loc1, JoiningNodes, 2048, 4) + [ + location_claim_tester(l1n1, loc1, JoiningNodes, 64, 3), + location_claim_tester(l1n1, loc1, JoiningNodes, 256, 3), + location_claim_tester(l1n1, loc1, JoiningNodes, 512, 3), + location_claim_tester(l1n1, loc1, JoiningNodes, 1024, 3), + location_claim_tester(l1n1, loc1, JoiningNodes, 2048, 3) ]}}. 
location_claim_tester(N1, N1Loc, NodeLocList, RingSize, TargetN) -> @@ -436,17 +435,15 @@ location_multistage_t1_test_() -> ], {inparallel, [ - location_multistage_claim_tester(64, JoiningNodes, 4, l5n9, loc5, 4), - location_multistage_claim_tester(128, JoiningNodes, 4, l5n9, loc5, 4), - location_multistage_claim_tester(256, JoiningNodes, 4, l5n9, loc5, 4), - location_multistage_claim_tester(512, JoiningNodes, 4, l5n9, loc5, 4), - location_multistage_claim_tester(1024, JoiningNodes, 4, l5n9, loc5, 4), - location_multistage_claim_tester(2048, JoiningNodes, 4, l5n9, loc5, 4) + multistage_claim_tester(64, JoiningNodes, 4, l5n9, loc5, 4), + multistage_claim_tester(128, JoiningNodes, 4, l5n9, loc5, 4), + multistage_claim_tester(256, JoiningNodes, 4, l5n9, loc5, 4), + multistage_claim_tester(512, JoiningNodes, 4, l5n9, loc5, 4) ]}. -location_multistage_claim_tester(RingSize, JoiningNodes, TargetN, NewNode, NewLocation, VerifyN) -> - {timeout, 240, +multistage_claim_tester(RingSize, JoiningNodes, TargetN, NewNode, NewLocation, VerifyN) -> + {timeout, 300, {"Ringsize " ++ integer_to_list(RingSize), fun() -> SW0 = os:timestamp(), @@ -522,91 +519,97 @@ location_multistage_claim_tester(RingSize, JoiningNodes, TargetN, NewNode, NewLo ) end}}. +location_typical_expansion_longrunning_test_() -> + %% Long-running as one step will require brute-force + {timeout, + 300, + {"RingSize 2048", fun() -> typical_expansion_tester(2048) end}}. + location_typical_expansion_test_() -> {"Typical expansion", {inparallel, - [location_typical_expansion_tester(64), - location_typical_expansion_tester(128), - location_typical_expansion_tester(256), - location_typical_expansion_tester(512), - location_typical_expansion_tester(1024) + [ + {timeout, 60, + {"Ringsize 64", fun() -> typical_expansion_tester(64) end}}, + {timeout, 60, + {"Ringsize 128", fun() -> typical_expansion_tester(128) end}}, + {timeout, 60, + {"Ringsize 256", fun() -> typical_expansion_tester(256) end}}, + {timeout, 60, + {"Ringsize 512", fun() -> typical_expansion_tester(512) end}} ]}}. 
-location_typical_expansion_tester(RingSize) -> - {timeout, 120, - {"Ringsize "++integer_to_list(RingSize), - fun() -> - N1 = l1n1, - N1Loc = loc1, - TargetN = 4, - InitJoiningNodes = - [{l1n2, loc1}, - {l2n3, loc2}, {l2n4, loc2}, - {l3n5, loc3}, {l3n6, loc3}, - {l4n7, loc4}, {l4n8, loc4}], +typical_expansion_tester(RingSize) -> + N1 = l1n1, + N1Loc = loc1, + TargetN = 4, + InitJoiningNodes = + [{l1n2, loc1}, + {l2n3, loc2}, {l2n4, loc2}, + {l3n5, loc3}, {l3n6, loc3}, + {l4n7, loc4}, {l4n8, loc4}], - io:format( - "Testing NodeList ~w with RingSize ~w~n", - [[{N1, N1Loc}|InitJoiningNodes], RingSize] - ), - Params = [{target_n_val, TargetN}], - R1 = - riak_core_ring:set_node_location( - N1, - N1Loc, - riak_core_ring:fresh(RingSize, N1)), + Params = [{target_n_val, TargetN}], + R1 = + riak_core_ring:set_node_location( + N1, + N1Loc, + riak_core_ring:fresh(RingSize, N1)), - RClaimInit = add_nodes_to_ring(R1, N1, InitJoiningNodes, Params), - {RingSize, MappingsInit} = riak_core_ring:chash(RClaimInit), + RClaimInit = add_nodes_to_ring(R1, N1, InitJoiningNodes, Params), + {RingSize, MappingsInit} = riak_core_ring:chash(RClaimInit), - check_for_failures(MappingsInit, TargetN, RClaimInit), + check_for_failures(MappingsInit, TargetN, RClaimInit), - Stage1Ring = commit_change(RClaimInit), + Stage1Ring = commit_change(RClaimInit), - RClaimStage2 = add_node(Stage1Ring, N1, l5n9, loc5, Params), - {RingSize, Mappings2} = riak_core_ring:chash(RClaimStage2), - check_for_failures(Mappings2, TargetN, RClaimStage2), - Stage2Ring = commit_change(RClaimStage2), + RClaimStage2 = add_node(Stage1Ring, N1, l5n9, loc5, Params), + {RingSize, Mappings2} = riak_core_ring:chash(RClaimStage2), + check_for_failures(Mappings2, TargetN, RClaimStage2), + Stage2Ring = commit_change(RClaimStage2), - RClaimStage3 = add_node(Stage2Ring, N1, l5n10, loc5, Params), - {RingSize, Mappings3} = riak_core_ring:chash(RClaimStage3), - check_for_failures(Mappings3, TargetN, RClaimStage3), - Stage3Ring = commit_change(RClaimStage3), - - RClaimStage4 = add_node(Stage3Ring, N1, l6n11, loc6, Params), - {RingSize, Mappings4} = riak_core_ring:chash(RClaimStage4), - check_for_failures(Mappings4, TargetN, RClaimStage4), - Stage4Ring = commit_change(RClaimStage4), - - RClaimStage5 = add_node(Stage4Ring, N1, l6n12, loc6, Params), - {RingSize, Mappings5} = riak_core_ring:chash(RClaimStage5), - check_for_failures(Mappings5, TargetN, RClaimStage5), - Stage5Ring = commit_change(RClaimStage5), - - RClaimStage6 = add_node(Stage5Ring, N1, l1n13, loc1, Params), - {RingSize, Mappings6} = riak_core_ring:chash(RClaimStage6), - check_for_failures(Mappings6, TargetN, RClaimStage6), - Stage6Ring = commit_change(RClaimStage6), - - RClaimStage7 = add_node(Stage6Ring, N1, l2n14, loc2, Params), - {RingSize, Mappings7} = riak_core_ring:chash(RClaimStage7), - check_for_failures(Mappings7, TargetN, RClaimStage7), - Stage7Ring = commit_change(RClaimStage7), - - RClaimStage8 = add_node(Stage7Ring, N1, l3n15, loc3, Params), - {RingSize, Mappings8} = riak_core_ring:chash(RClaimStage8), - check_for_failures(Mappings8, TargetN, RClaimStage8), - Stage8Ring = commit_change(RClaimStage8), - - RClaimStage9 = add_node(Stage8Ring, N1, l4n16, loc4, Params), - {RingSize, Mappings9} = riak_core_ring:chash(RClaimStage9), - check_for_failures(Mappings9, TargetN, RClaimStage9), - _Stage9Ring = commit_change(RClaimStage9) - end}}. 
+ RClaimStage3 = add_node(Stage2Ring, N1, l5n10, loc5, Params), + {RingSize, Mappings3} = riak_core_ring:chash(RClaimStage3), + check_for_failures(Mappings3, TargetN, RClaimStage3), + Stage3Ring = commit_change(RClaimStage3), + + RClaimStage4 = add_node(Stage3Ring, N1, l6n11, loc6, Params), + {RingSize, Mappings4} = riak_core_ring:chash(RClaimStage4), + check_for_failures(Mappings4, TargetN, RClaimStage4), + Stage4Ring = commit_change(RClaimStage4), + + RClaimStage5 = add_node(Stage4Ring, N1, l6n12, loc6, Params), + {RingSize, Mappings5} = riak_core_ring:chash(RClaimStage5), + check_for_failures(Mappings5, TargetN, RClaimStage5), + Stage5Ring = commit_change(RClaimStage5), + + RClaimStage6 = add_node(Stage5Ring, N1, l1n13, loc1, Params), + {RingSize, Mappings6} = riak_core_ring:chash(RClaimStage6), + check_for_failures(Mappings6, TargetN, RClaimStage6), + Stage6Ring = commit_change(RClaimStage6), + + RClaimStage7 = add_node(Stage6Ring, N1, l2n14, loc2, Params), + {RingSize, Mappings7} = riak_core_ring:chash(RClaimStage7), + check_for_failures(Mappings7, TargetN, RClaimStage7), + Stage7Ring = commit_change(RClaimStage7), + + RClaimStage8 = add_node(Stage7Ring, N1, l3n15, loc3, Params), + {RingSize, Mappings8} = riak_core_ring:chash(RClaimStage8), + check_for_failures(Mappings8, TargetN, RClaimStage8), + Stage8Ring = commit_change(RClaimStage8), + + RClaimStage9 = add_node(Stage8Ring, N1, l4n16, loc4, Params), + {RingSize, Mappings9} = riak_core_ring:chash(RClaimStage9), + check_for_failures(Mappings9, TargetN, RClaimStage9), + _Stage9Ring = commit_change(RClaimStage9). add_node(Ring, Claimant, Node, Location, Params) -> - RingC = add_nodes_to_ring(Ring, Claimant, [{Node, Location}], Params), + {ClaimTime, RingC} = + timer:tc( + fun() -> + add_nodes_to_ring(Ring, Claimant, [{Node, Location}], Params) end + ), OwnersPre = riak_core_ring:all_owners(Ring), OwnersPost = riak_core_ring:all_owners(RingC), @@ -627,19 +630,15 @@ add_node(Ring, Claimant, Node, Location, Params) -> NodeCounts = lists:map(fun({_N, C}) -> C end, dict:to_list(NodeCountD)), io:format( - % user, - "NodeCounts~w~n", - [dict:to_list(NodeCountD)]), - io:format( - % user, - "Adding node ~w in location ~w - ~w transfers ~w max ~w min vnodes~n", + "Adding node ~w in location ~w - ~w transfers ~w max ~w min vnodes" + " ClaimTime ~w ms~n", [Node, Location, - length(Next), lists:max(NodeCounts), lists:min(NodeCounts)]), + length(Next), lists:max(NodeCounts), lists:min(NodeCounts), + ClaimTime div 1000]), ?assert( (lists:min(NodeCounts) == (lists:max(NodeCounts) - 1)) or (lists:min(NodeCounts) == lists:max(NodeCounts)) ), - % ?assert(length(Next) =< ExpectedTransferMax), RingC. commit_change(Ring) -> From 2dd845fdf8937766f2e625d0dc11b62390470911 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Fri, 19 May 2023 19:07:06 +0100 Subject: [PATCH 20/30] Add support for configured target_location_n_val --- src/riak_core_membership_claim.erl | 16 +++++++++------- src/riak_core_membership_leave.erl | 2 +- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/riak_core_membership_claim.erl b/src/riak_core_membership_claim.erl index b35f5f48e..b48b51e35 100644 --- a/src/riak_core_membership_claim.erl +++ b/src/riak_core_membership_claim.erl @@ -209,17 +209,19 @@ default_choose_params() -> default_choose_params([]). default_choose_params(Params) -> - case proplists:get_value(target_n_val, Params) of - undefined -> - TN = get_target_n(), - [{target_n_val, TN} | Params]; - _-> - Params - end. 
+    TN = proplists:get_value(target_n_val, Params, get_target_n()),
+    TLN =
+        proplists:get_value(
+            target_location_n_val, Params, get_target_location_n(TN)),
+    lists:ukeysort(
+        1, [{target_n_val, TN}, {target_location_n_val, TLN}] ++ Params).
 
 get_target_n() ->
     app_helper:get_env(riak_core, target_n_val, ?DEF_TARGET_N).
 
+get_target_location_n(TargetN) ->
+    app_helper:get_env(riak_core, target_location_n_val, TargetN).
+
 
 %% ===================================================================
 %% Claim Function Implementations
diff --git a/src/riak_core_membership_leave.erl b/src/riak_core_membership_leave.erl
index badccc7ce..886b31646 100644
--- a/src/riak_core_membership_leave.erl
+++ b/src/riak_core_membership_leave.erl
@@ -153,7 +153,7 @@ simple_transfer([{P, ExitingNode}|Rest],
                 lists:sublist(ForwardL, Steps)
         end,
     fun({Node, _Count}) ->
-        %% Nodes will remian as candidates if they are not in the list
+        %% Nodes will remain as candidates if they are not in the list
         %% of unsafe nodes
         not lists:keymember(Node, 2, UnsafeNodeTuples)
     end

From 0c05dc466491c5286335c5a9406e9b9860330488 Mon Sep 17 00:00:00 2001
From: Martin Sumner
Date: Sat, 20 May 2023 00:39:57 +0100
Subject: [PATCH 21/30] Memoise fixes

The cache of v4 solutions is required by the ring_manager and the
claimant - so specifically update both of these processes each time.
Otherwise the cache will be missed when the ring_manager calls
riak_core_claimant:ring_changed/2.

There is a fix to the last-gasp check before writing the ring file. The
prune_write_notify_ring function does not care if the write of a ring
errors - so return an error from this function rather than crashing the
ring manager. Crashing here otherwise causes instability in location
tests.
---
 src/riak_core_claim_swapping.erl   | 72 +++++++++++++++++++++++-------
 src/riak_core_claimant.erl         | 65 ++++++++++++++++++---------
 src/riak_core_membership_claim.erl |  5 +++
 src/riak_core_ring_manager.erl     | 18 ++++++--
 4 files changed, 120 insertions(+), 40 deletions(-)

diff --git a/src/riak_core_claim_swapping.erl b/src/riak_core_claim_swapping.erl
index 71fb2ee4c..7d639ec07 100644
--- a/src/riak_core_claim_swapping.erl
+++ b/src/riak_core_claim_swapping.erl
@@ -70,9 +70,14 @@ memoize(Registry, Key, Fun) ->
         end,
     case lists:keyfind({Registry, Key}, 1, V4Solutions) of
         {{Registry, Key}, Solution} ->
-            lager:info("Retrieved solve from memory for ~w", [Registry]),
+            lager:info(
+                "Retrieved solve from cache of ~w for ~w",
+                [length(V4Solutions), Registry]),
             Solution;
         _ ->
+            lager:info(
+                "No cache hit from cache of ~w for ~w",
+                [length(V4Solutions), Registry]),
             Value = Fun(),
             riak_core_claimant:update_v4_solutions({{Registry, Key}, Value}),
             Value
@@ -356,24 +361,59 @@ location_t8_test_() ->
         location_claim_tester(l1n1, loc1, JoiningNodes, 2048, 3)
        ]}}.
 
+location_t9_test_() ->
+    JoiningNodes =
+        [{l1n2, loc1}, {l1n3, loc1}, {l1n4, loc1},
+        {l2n1, loc2}, {l2n2, loc2}, {l2n3, loc2},
+        {l3n1, loc3}, {l3n2, loc3}, {l3n3, loc3},
+        {l4n1, loc4}, {l4n2, loc4}, {l4n3, loc4}],
+    {"[4, 3, 3, 3] nval 4 location nval 3",
+        {inparallel,
+        [
+            location_claim_tester(l1n1, loc1, JoiningNodes, 64, 4, 3),
+            location_claim_tester(l1n1, loc1, JoiningNodes, 256, 4, 3),
+            location_claim_tester(l1n1, loc1, JoiningNodes, 512, 4, 3),
+            location_claim_tester(l1n1, loc1, JoiningNodes, 1024, 4, 3),
+            location_claim_tester(l1n1, loc1, JoiningNodes, 2048, 4, 3)
+        ]}}.
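%% How the two targets resolve, sketched against default_choose_params/1
%% and get_target_location_n/1 in riak_core_membership_claim above: with
%% no target_location_n_val set, the location target defaults to
%% target_n_val, so node-only clusters keep their existing behaviour
%% (illustrative function name):
target_config_example() ->
    ok = application:set_env(riak_core, target_n_val, 4),
    %% both targets now resolve to 4;
    ok = application:set_env(riak_core, target_location_n_val, 3),
    %% claims now aim for 4 distinct nodes and 3 distinct locations
    ok.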
+ +location_t10_test_() -> + JoiningNodes = + [{l1n2, loc1}, {l2n3, loc2}, {l3n5, loc3}, {l4n7, loc4}, {l4n8, loc4}], + {"[2, 1, 1, 2] nval 4 location nval 3", + {inparallel, + [ + location_claim_tester(l1n1, loc1, JoiningNodes, 64, 4, 3), + location_claim_tester(l1n1, loc1, JoiningNodes, 256, 4, 3), + location_claim_tester(l1n1, loc1, JoiningNodes, 512, 4, 3), + location_claim_tester(l1n1, loc1, JoiningNodes, 1024, 4, 3), + location_claim_tester(l1n1, loc1, JoiningNodes, 2048, 4, 3) + ]}}. + location_claim_tester(N1, N1Loc, NodeLocList, RingSize, TargetN) -> + location_claim_tester(N1, N1Loc, NodeLocList, RingSize, TargetN, TargetN). + +location_claim_tester(N1, N1Loc, NodeLocList, RingSize, TargetN, TargetLN) -> {"Ringsize "++integer_to_list(RingSize), - {timeout, 120, + {timeout, 300, fun() -> - io:format( - "Testing NodeList ~w with RingSize ~w~n", - [[{N1, N1Loc}|NodeLocList], RingSize] - ), - R1 = - riak_core_ring:set_node_location( - N1, - N1Loc, - riak_core_ring:fresh(RingSize, N1)), - - RClaim = add_nodes_to_ring(R1, N1, NodeLocList, [{target_n_val, TargetN}]), - {RingSize, Mappings} = riak_core_ring:chash(RClaim), - - check_for_failures(Mappings, TargetN, RClaim) + io:format( + "Testing NodeList ~w with RingSize ~w~n", + [[{N1, N1Loc}|NodeLocList], RingSize] + ), + R1 = + riak_core_ring:set_node_location( + N1, + N1Loc, + riak_core_ring:fresh(RingSize, N1)), + + RClaim = + add_nodes_to_ring( + R1, N1, NodeLocList, + [{target_n_val, TargetN}, {target_location_n_val, TargetLN}]), + {RingSize, Mappings} = riak_core_ring:chash(RClaim), + + check_for_failures(Mappings, TargetLN, RClaim) end}}. add_nodes_to_ring(Ring, Claimant, NodeLocList, Params) -> diff --git a/src/riak_core_claimant.erl b/src/riak_core_claimant.erl index 08274495c..e95fd0d24 100644 --- a/src/riak_core_claimant.erl +++ b/src/riak_core_claimant.erl @@ -44,7 +44,8 @@ get_bucket_type/3, bucket_type_iterator/0, set_node_location/2, - update_v4_solutions/1]). + update_v4_solutions/1, + update_v4_cache/1]). -export([reassign_indices/1]). % helpers for claim sim %% gen_server callbacks @@ -70,6 +71,8 @@ #{location := pos_integer(), node := pos_integer()}}}, binary()}. +-export_type([v4_solution/0]). + -record(state, { last_ring_id, %% The set of staged cluster changes @@ -84,11 +87,7 @@ %% Random number seed passed to remove_node to ensure the %% current randomized remove algorithm is deterministic %% between plan and commit phases - seed, - - %% List of v4 solutions - to be copied to the process memory of - %% the riak_core_ring_manager when - v4_solutions = [] :: list(v4_solution())}). + seed}). -define(ROUT(S,A),ok). %%-define(ROUT(S,A),?debugFmt(S,A)). @@ -256,7 +255,16 @@ bucket_type_iterator() -> riak_core_metadata:iterator(?BUCKET_TYPE_PREFIX, [{default, undefined}, {resolver, fun riak_core_bucket_props:resolve/2}]). - +-spec update_v4_cache(v4_solution()) -> ok. +update_v4_cache(V4Solution) -> + Cache = + case get(v4_solutions) of + RetrievedCache when is_list(RetrievedCache) -> + RetrievedCache; + _ -> + [] + end, + put(v4_solutions, lists:ukeysort(1, [V4Solution|Cache])). %%%=================================================================== %%% Claim sim helpers until refactor @@ -362,11 +370,9 @@ handle_call(_Request, _From, State) -> {reply, Reply, State}. 
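%% The cache round-trip, sketched with the key shape used by memoize/3
%% in riak_core_claim_swapping (the bindings here are illustrative): the
%% claimant stores the solution in its process dictionary via
%% update_v4_cache/1 and casts it on to the ring_manager, so both
%% processes can answer later memoize/3 lookups:
v4_cache_example(RingSize, Config, NVals, BinRing) ->
    Key = {binring_solve, {RingSize, Config, NVals}},
    riak_core_claimant:update_v4_solutions({Key, BinRing}).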
handle_cast({update_v4_solutions, V4Solution}, State) -> - {noreply, - State#state{ - v4_solutions = - lists:ukeysort(1, [V4Solution|State#state.v4_solutions]) - }}; + update_v4_cache(V4Solution), + ok = riak_core_ring_manager:update_v4_solutions(V4Solution), + {noreply, State}; handle_cast(_Msg, State) -> {noreply, State}. @@ -419,7 +425,6 @@ generate_plan([], _, State) -> %% There are no changes to apply {{ok, [], []}, State}; generate_plan(Changes, Ring, State=#state{seed=Seed}) -> - put(v4_solutions, State#state.v4_solutions), case compute_all_next_rings(Changes, Seed, Ring) of {error, invalid_resize_claim} -> {{error, invalid_resize_claim}, State}; @@ -430,6 +435,7 @@ generate_plan(Changes, Ring, State=#state{seed=Seed}) -> {Reply, State2} end. + %% @private %% @doc Commit the set of staged cluster changes. See {@link commit/0} %% for additional details. @@ -455,12 +461,10 @@ maybe_commit_staged(State) -> %% @private maybe_commit_staged(Ring, State=#state{changes=Changes, seed=Seed}) -> Changes2 = filter_changes(Changes, Ring), - put(v4_solutions, State#state.v4_solutions), case compute_next_ring(Changes2, Seed, Ring) of {error, invalid_resize_claim} -> {ignore, invalid_resize_claim}; {ok, NextRing} -> - erase(v4_solutions), maybe_commit_staged(Ring, NextRing, State) end. @@ -1117,12 +1121,14 @@ change({leave, Node}, Ring) -> Members = riak_core_ring:all_members(Ring), lists:member(Node, Members) orelse throw(invalid_member), Ring2 = riak_core_ring:leave_member(Node, Ring, Node), - Ring2; + Ring3 = update_location_on_leave(Ring2), + Ring3; change({remove, Node}, Ring) -> Members = riak_core_ring:all_members(Ring), lists:member(Node, Members) orelse throw(invalid_member), Ring2 = riak_core_ring:remove_member(Node, Ring, Node), - Ring2; + Ring3 = update_location_on_leave(Ring2), + Ring3; change({{replace, _NewNode}, Node}, Ring) -> %% Just treat as a leave, reassignment happens elsewhere Ring2 = riak_core_ring:leave_member(Node, Ring, Node), @@ -1144,6 +1150,18 @@ change({abort_resize, _Node}, Ring) -> change({{set_location, Location}, Node}, Ring) -> riak_core_ring:set_node_location(Node, Location, Ring). +update_location_on_leave(Ring) -> + LocationAware = + riak_core_location:has_location_set_in_cluster( + riak_core_ring:get_nodes_locations(Ring)), + case LocationAware of + true -> + riak_core_ring:force_location_changed(Ring, true); + false -> + Ring + end. 
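%% A sketch of the leave path this enables: in a location-aware cluster,
%% leaving or removing a member raises the force_location_changed flag,
%% which the next claim observes via has_location_changed/1 (see the
%% riak_core_membership_claim hunk below) and re-plans the whole ring
%% rather than making an incremental change (illustrative function):
leave_reclaim_example(Node, Ring0) ->
    Ring1 = riak_core_ring:leave_member(Node, Ring0, Node),
    %% when any location is set, the flag forces a full re-claim
    update_location_on_leave(Ring1).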
+ + internal_ring_changed(Node, CState) -> {Changed, CState5} = do_claimant(Node, CState, fun log/2), inform_removed_nodes(Node, CState, CState5), @@ -1486,10 +1504,15 @@ rebalance_ring(CState) -> rebalance_ring([], CState) -> SW = os:timestamp(), CState2 = riak_core_membership_claim:claim(CState), - lager:info( - "Claim algorithm request completed in claim_time=~w ms", - [timer:now_diff(os:timestamp(), SW) div 1000] - ), + case CState2 of + CState -> + ok; + CState2 -> + lager:info( + "Claim algorithm completed in claim_time=~w ms", + [timer:now_diff(os:timestamp(), SW) div 1000] + ) + end, Owners1 = riak_core_ring:all_owners(CState), Owners2 = riak_core_ring:all_owners(CState2), Owners3 = lists:zip(Owners1, Owners2), diff --git a/src/riak_core_membership_claim.erl b/src/riak_core_membership_claim.erl index b48b51e35..34195fc79 100644 --- a/src/riak_core_membership_claim.erl +++ b/src/riak_core_membership_claim.erl @@ -129,6 +129,9 @@ claim(Ring, {WMod, WFun}=Want, Choose) -> true -> case riak_core_ring:has_location_changed(Ring) of true -> + lager:info( + "Claim requested with ~w due to location change", + [Choose]), [HeadMember|_Rest] = Members, choose_new_ring( riak_core_ring:clear_location_changed(Ring), @@ -138,6 +141,8 @@ claim(Ring, {WMod, WFun}=Want, Choose) -> Ring end; false -> + lager:info( + "Claim requested with ~w due to initial wants", [Choose]), lists:foldl( fun(Node, Ring0) -> claim_until_balanced(Ring0, Node, Want, Choose) diff --git a/src/riak_core_ring_manager.erl b/src/riak_core_ring_manager.erl index d6c72e0ae..b957e365c 100644 --- a/src/riak_core_ring_manager.erl +++ b/src/riak_core_ring_manager.erl @@ -88,7 +88,8 @@ run_fixups/3, set_cluster_name/1, stop/0, - is_stable_ring/0]). + is_stable_ring/0, + update_v4_solutions/1]). -export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2, code_change/3]). @@ -236,8 +237,12 @@ do_write_ringfile(Ring) -> Dir -> FN = generate_ring_filename( Dir, app_helper:get_env(riak_core, cluster_name)), - false = riak_core_ring:check_lastgasp(Ring), - do_write_ringfile(Ring, FN) + case riak_core_ring:check_lastgasp(Ring) of + false -> + do_write_ringfile(Ring, FN); + _ -> + {error, last_gasp} + end end. generate_ring_filename(Dir, ClusterName) -> @@ -335,6 +340,10 @@ prune_ringfiles() -> stop() -> gen_server:cast(?MODULE, stop). +-spec update_v4_solutions(riak_core_claimant:v4_solution()) -> ok. +update_v4_solutions(V4Solution) -> + gen_server:cast(?MODULE, {update_v4_solutions, V4Solution}). + %% =================================================================== %% gen_server callbacks @@ -435,6 +444,9 @@ handle_call(is_stable_ring, _From, State) -> {IsStable, _DeltaMS} = is_stable_ring(State), {reply, IsStable, State}. 
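%% Last-gasp handling in do_write_ringfile/1 above, sketched: a ring
%% flagged as a last gasp is never written, and callers that tolerate
%% failed writes (such as prune_write_notify_ring, per the commit
%% message) can match on the error rather than crash the ring manager
%% (the return shapes other than {error, _} are assumed here):
tolerant_write_example(Ring) ->
    case do_write_ringfile(Ring) of
        {error, last_gasp} -> ok;   %% skip the write, keep running
        {error, _Reason} -> ok;     %% tolerated by this caller
        _ -> ok
    end.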
+handle_cast({update_v4_solutions, V4Solution}, State) ->
+    riak_core_claimant:update_v4_cache(V4Solution),
+    {noreply, State};
 handle_cast(stop, State) ->
     {stop,normal,State};

From 057b17ed85c411bfaeb3b72541ef70c1befd70ae Mon Sep 17 00:00:00 2001
From: Thomas Arts
Date: Tue, 23 May 2023 12:28:16 +0200
Subject: [PATCH 22/30] Example configurations saved in source format (#1005)

* Remove pre-computed test suite

* cleanup

* Make claim_eqc tests not fail on weird configs by supplying a diverse
  list of options
---
 claim.suite                         | Bin 347393 -> 0 bytes
 src/riak_core_claim_binring_alg.erl |   6 +-
 test/riak_core_claim_eqc.erl        | 385 +++++++++++++++++++++++++++-
 3 files changed, 385 insertions(+), 6 deletions(-)
 delete mode 100644 claim.suite

diff --git a/claim.suite b/claim.suite
deleted file mode 100644
index c4e685902a07612cec60c4b41341ce4c5767aaa9..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

[347393 bytes of base85-encoded binary patch data for the deleted
claim.suite pre-computed test suite omitted]
z#ZyjlJ9wFt(=rgi?bPky#z2d++xxXK5Oj#_?t3a(gonmJfzB2!12rpt4h*!Mh3PQ{ zDvW_dNUk+Oba5u4*Sa8yd()aAiF?y}Adw{3njm=M;QixxsAV7#y(O6-cr;U(Ac?^` zbq1mngNk=R7!5jwir=~7+@L}nP(dl1F-*uSOwTY3nC8eZM2)~RF%~#Ah7sw70#def z=WwDSn`V2e)pcOC_(aAN9)YeuDX*DBZBRzT+17Q8ag1@^tVM6RHO2|#n3FP&$VKvn zop5Izsw!rKk>Z?%Lq(%m@R>z+7TNv%-U#AOi|ky5*uPAp(Mxe(lB1y#fvp*;Kt)Px zs6-X10_g1w&rXQsfzOyRM4}cbD6c%St#Xat=6B>|MS(T$46eX})Jt%?|ZJIHq9u#?NKFtu}A?!k9^x#)T z7+}+brU!qFgX_rwpu}uz@lWJgwSrIrt~~yQOJ^ch;J2ZPz}1P(S)i+lO%t2GN>pt4 zIH}s45^P17l_S`Sl}#N8otGEV!;&2;Pz{dqmmy;$uo z{dqk!yeMeDPxB9mJ=#`;(*oXRa~_c}e-DIm3M? z;OC0_A~T+V`;!&&L47Y>{Xt_>9ZuGD>*{p9Y`o zNu*)9(pa6i7Sh%M$yLR#t%Xk!Hf!-yg3Vg^6k)R#K1JBf<7arTNU)ig&2ntENW<;l zwn(FXs8=k!3184Pa?9?d(w4sYID95RGar&`u~~xqT4)yGz80G$xUYp~5$d5v^GeIgunzVTe}CDH6WYQi_PLw3H%Zah_8m z>j%Em*c|7N=MeFgYuA|rTbaoTuvI}OM*`~!by+@OO5!7i>$H4C2*yBfj(j9SN8_Ww zHvBX`V*FD9d_<{R^aS2A?2{2v*@R@NTj+dVW2DeiK^Mc&^ocs8@)^nXW^r2@$|wX` zq8Ff~w&X<-O2JzijIct`Tr6QtAL&6L84H-SghfxBxwtmd6B$dbhr$ekiQRY$C5qh| zQkj9zF;X#7QI?5BO{mD z5V*vo>&X?0B-Jx*ok--Mw1fm-C%a`*TPG4huh!*AB0nu5NtzY=s-?BA}kEr9t8|cfhidM%9SVe0NdDy;L-`3Z6Z?=D~ zH;eFkBQh-Tlw%1QYKc}vhFYeTkfD}nMU5v~rnT5qYu&X>i*uZ5YT=1iOoq-%#ts2B zGNeCA}5%}jlc!5*i*YvknViSO!Za##|p8QV3K1` zXbB@=>A=j-ol${@s1#KC7sZ$s0)&9wLI4W^))oSMnleDaAwd1%?ZUkp(7kZBKEH7? zdv#US=k* zgS4h#9{;p`~ z?}~>0u4w4*iiZBKaQMRC6@u3be^+=Y1+tP?j;(`HvZGQtj7WgY-cdiXL znq?EexVEXWS-|m&^BoR@EohD`{I+t(SI)1-=t}c$^dr;Nzgna=x&qtXEU5OG5aoDc zprI!Q8hT=&p(h3!9*?e!ftr_}09C{g>*N?ngyTsJbjh?dY8eQZ5JzuX1`-jomVxNv zj9%dxD6HW`^v37wae9MOQ&tPda7c0ch73Px)|l3l;WdtG%Z%sti?YqiGM;PRm(@&X znwCYFEr}u0QXQce{EYFa)@?~#qSkDQkTH$wVCBS~ik9j`grTK6k-^nc9bG(`>V#}G z@LPR0(pWBIBbd}wbOe&{{>DPWYIn(mgj2dMVcI& zjp&gxRHtPl5!I!!5pYtBq_xINM0KMXFM5f>UL~wgbw4=y{*lEnqdKEHVESeaw*HTX zMs<=m{!MATCZ#&Oai&(JWjeZX_#0xdtED;#jcS=rWR2384i;DJsc4xFpUOl;ClVJt z(S_qnk+{$v$te;S+A}$6vM}GLb{CIGT=2ViWIh^fw0o^Y@#5ln%V4+~ymKYQ=zQku za`>4q%i+j3k>JSpkRV|C781h!_L>Bydn0SM@lUE*gYCniAtNr@UnbX%$J`;U%aaxtMYlU}#3GV$_>h-J1yMZ}xhO-dB%#X(#fsZc~1zqYnf z0X+xflC{+v5wmF%6_Wn&GgbuAmTyF2LR-EO)rPc6MHKh*jS8Z;-)*by?^EhCTN!W6qeGM4fria_&l?PGZhhS z@+MqNn@}a6L7CiW(`eIZb6&KGW^vWC9) zWRpw(m>Ps{pC}FegQDS>4Snrjq=~NE{}%j8wR`c4mE5u9Uaa4rQn-ki?nH$PEb7vP zOS9tRH*Ufus4Yt94Sa&z`8&sb_UjVgI`q$elcViJm}kIZT0_1CKgZgFNF-@%3yp@3 zvr~>@gPsc0OrT9kM0tQVC80;o6fV+g3rq&1RU4wh1s{`03m5D;Ic4yw6)xtwkLJ@>_OvM9MQP7Ye#GB}y{y5AMyb&ZpalYN1ufi2_Y=`(pURui3rhoL;< ztF%5@WH+@wSw#5Sb^P63a#8{VuD~U%UCFrA=H3qQ*h=Jia3iN8Z=sXzJ5l;1P}Xt`P|UB<-1o zg&J!gN~-?CI*nbM-!nZ>y+Q;$ziz4vdrae$`)vi;C2+k4hB}<6ceYI;1Lx;9jIy;L!3k% z(`QU>o(B|DffK!a>L_K}&pzy@&a_u5rX%CKO?y*J*LQEWe;%xCOT708ipf>w zoF??;BlJWc2cCFGxjbB^a#L`Uk_{FtNEX}fsUtHV7LJK{9ZN7(_$a zk;o2T%ZJxzEu!IC$P8ZFd&3r@i>Y7hHzwMOCb|}GiJ9UPT?@AonlQK>h9X+HmC&0O PZJT8ipC}ok?aluKDCW#G diff --git a/src/riak_core_claim_binring_alg.erl b/src/riak_core_claim_binring_alg.erl index 6174ea4e5..d86ae1ac6 100644 --- a/src/riak_core_claim_binring_alg.erl +++ b/src/riak_core_claim_binring_alg.erl @@ -74,7 +74,7 @@ %% Step 3. %% Fill the gaps with additional nodes (part of the small ring) if needed %% to get to full ring size. -%% While n-val not reached (zero_violations is false): +%% While n-val not reached (zero violations is false): %% swap nodes (exchange position) or move nodes %% (moving vnode I to before vnode J). %% @@ -788,14 +788,14 @@ prop_no_locations() -> end). config_gen() -> - ?LET(N, choose(1,7), ?LET(M, choose(2, 6), vector(N, choose(M, M + 2)))). + ?LET(N, choose(1,7), ?LET(M, choose(2, 6), vector(N, choose(M, M + 2)))). 
prop_brute_force_optimize() -> in_parallel( ?FORALL({Size, Config, NValsMap}, {elements([128, 256, 512]), config_gen(), ?LET(N, choose(3, 4), #{node => N, location => default(N, choose(2, N))})}, - ?IMPLIES(length(Config) >= maps:get(location, NValsMap), + ?IMPLIES(length(Config) >= maps:get(location, NValsMap), begin NVals = to_nvals(NValsMap), {T1, Ring1} = timer:tc(fun() -> solve(Size, Config, NValsMap, [no_brute_force]) end), diff --git a/test/riak_core_claim_eqc.erl b/test/riak_core_claim_eqc.erl index b33220a6b..d4be6498f 100644 --- a/test/riak_core_claim_eqc.erl +++ b/test/riak_core_claim_eqc.erl @@ -295,9 +295,10 @@ necessary_conditions(S) -> sufficient_conditions(S) -> Locations = to_config(S), - length(Locations) >= S#state.nval + 2 - andalso length(S#state.nodes) < S#state.ring_size div 2 - andalso lists:min(Locations) >= lists:max(Locations) - 2. + known_solution(S#state.ring_size, Locations, S#state.nval) orelse + (length(Locations) >= S#state.nval + 2 + andalso length(S#state.nodes) < S#state.ring_size div 2 + andalso lists:min(Locations) >= lists:max(Locations) - 1). to_config(S) -> LocNodes = @@ -524,3 +525,381 @@ prop_translate() -> {Idx2, _} <- [lists:keyfind({L,N}, 2, NewLocRel)], Idx1 == Idx2], StayTheSame) end))). + +known_solution(Size, Config, NVal) -> + lists:member({Size, Config, #{node => NVal, location => NVal}}, solution_list()). + +solution_list() -> + [{64,[1,1,2,3],#{location => 2,node => 2}}, + {64,[1,1,3,2,2,3,3],#{location => 2,node => 2}}, + {64,[1,1,3,3,2,3,1],#{location => 4,node => 4}}, + {64,[1,1,4,5,5],#{location => 2,node => 2}}, + {64,[1,2,1,1,1,2,3],#{location => 2,node => 2}}, + {64,[1,2,1,2,3,1,2],#{location => 3,node => 3}}, + {64,[1,2,1,3,2,3],#{location => 4,node => 4}}, + {64,[1,2,2,4,4,2,2],#{location => 3,node => 3}}, + {64,[1,2,3,3],#{location => 3,node => 3}}, + {64,[1,2,4,4,3,6,5],#{location => 3,node => 3}}, + {64,[1,3,1,1,1,1,3],#{location => 3,node => 3}}, + {64,[1,3,2],#{location => 2,node => 2}}, + {64,[1,3,2,2,1,3,2],#{location => 3,node => 3}}, + {64,[1,3,2,2,5,6],#{location => 3,node => 3}}, + {64,[1,3,3,1,1,3,1],#{location => 2,node => 2}}, + {64,[1,3,4,2,3,2,1],#{location => 2,node => 2}}, + {64,[1,4,2,4],#{location => 2,node => 2}}, + {64,[1,4,3],#{location => 2,node => 2}}, + {64,[1,4,3,1,4,1],#{location => 2,node => 2}}, + {64,[1,4,4,1,4,3],#{location => 2,node => 2}}, + {64,[1,5,3,5],#{location => 2,node => 2}}, + {64,[1,5,4,3,6],#{location => 2,node => 2}}, + {64,[2,1,3],#{location => 2,node => 2}}, + {64,[2,1,3,5,5,2,3],#{location => 2,node => 2}}, + {64,[2,2,4,3,6],#{location => 2,node => 2}}, + {64,[2,3,1,1,1,2,2],#{location => 2,node => 2}}, + {64,[2,4,1,4,1],#{location => 3,node => 3}}, + {64,[2,4,1,5,2,5],#{location => 3,node => 3}}, + {64,[2,4,2,4,6],#{location => 2,node => 2}}, + {64,[2,4,3,4,3],#{location => 3,node => 3}}, + {64,[2,4,3,5],#{location => 2,node => 2}}, + {64,[2,5,2,2,5,4],#{location => 2,node => 2}}, + {64,[2,5,2,3,4,5],#{location => 2,node => 2}}, + {64,[2,5,3,2,1],#{location => 2,node => 2}}, + {64,[2,5,3,2,3,3],#{location => 3,node => 3}}, + {64,[2,5,4,5,1,4],#{location => 3,node => 3}}, + {64,[3,1,2,1,3,2],#{location => 3,node => 3}}, + {64,[3,1,3,3,1,1,3],#{location => 2,node => 2}}, + {64,[3,1,3,3,3,3,2],#{location => 3,node => 3}}, + {64,[3,2,1,2,1,1],#{location => 2,node => 2}}, + {64,[3,2,1,3,3,3],#{location => 2,node => 2}}, + {64,[3,2,2,5,3],#{location => 2,node => 2}}, + {64,[3,2,3,4,4,4,2],#{location => 4,node => 4}}, + {64,[3,3,1,2,1,2],#{location => 3,node => 3}}, + 
{64,[3,3,2,2,2,1,3],#{location => 3,node => 3}}, + {64,[3,3,3,1,3,1,1],#{location => 3,node => 3}}, + {64,[3,3,4,2,3,2],#{location => 2,node => 2}}, + {64,[3,3,5,4,3],#{location => 3,node => 3}}, + {64,[3,4,1],#{location => 2,node => 2}}, + {64,[3,4,1,3,2,1,2],#{location => 4,node => 4}}, + {64,[3,4,2,3,3],#{location => 2,node => 2}}, + {64,[3,4,4,1,4],#{location => 4,node => 4}}, + {64,[3,4,4,2,1,2,2],#{location => 2,node => 2}}, + {64,[3,4,4,4,5,4],#{location => 2,node => 2}}, + {64,[4,1,1,2,3,4,3],#{location => 4,node => 4}}, + {64,[4,1,4,3],#{location => 2,node => 2}}, + {64,[4,2,3,4,2,2],#{location => 3,node => 3}}, + {64,[4,2,5],#{location => 2,node => 2}}, + {64,[4,3,1,3,4,2,1],#{location => 3,node => 3}}, + {64,[4,3,1,4,2,4],#{location => 2,node => 2}}, + {64,[4,3,3,2,1],#{location => 2,node => 2}}, + {64,[4,3,3,5,2,3,5],#{location => 3,node => 3}}, + {64,[4,4,2,1],#{location => 2,node => 2}}, + {64,[4,4,2,4,2],#{location => 2,node => 2}}, + {64,[4,4,3,1,2],#{location => 2,node => 2}}, + {64,[4,5,3],#{location => 2,node => 2}}, + {64,[5,1,6],#{location => 2,node => 2}}, + {64,[5,3,3,3,4,4],#{location => 2,node => 2}}, + {64,[5,3,3,5,1,3,4],#{location => 3,node => 3}}, + {64,[5,4,5,2,3,6],#{location => 3,node => 3}}, + {64,[5,5,2],#{location => 2,node => 2}}, + {64,[5,5,2,1,5,5],#{location => 2,node => 2}}, + {64,[5,5,6,5,2,1],#{location => 2,node => 2}}, + {64,[6,2,4,3],#{location => 2,node => 2}}, + {64,[6,6,3,4,2,5,4],#{location => 4,node => 4}}, + {128,[1,1,1,1,2,2,3],#{location => 3,node => 3}}, + {128,[1,1,1,3,1,1,2],#{location => 2,node => 2}}, + {128,[1,1,2,3,1],#{location => 2,node => 2}}, + {128,[1,1,3,1,3],#{location => 2,node => 2}}, + {128,[1,1,3,2,3,3],#{location => 2,node => 2}}, + {128,[1,2,2,1,4,2],#{location => 2,node => 2}}, + {128,[1,2,2,2,3,2],#{location => 3,node => 3}}, + {128,[1,2,2,5,4,1],#{location => 2,node => 2}}, + {128,[1,2,3],#{location => 2,node => 2}}, + {128,[1,2,3,1,1,3,5],#{location => 2,node => 2}}, + {128,[1,2,3,3,3],#{location => 2,node => 2}}, + {128,[1,2,4,1],#{location => 2,node => 2}}, + {128,[1,2,4,4],#{location => 2,node => 2}}, + {128,[1,3,1,1],#{location => 2,node => 2}}, + {128,[1,3,3,1,2,1],#{location => 2,node => 2}}, + {128,[1,3,3,1,2,3,3],#{location => 3,node => 3}}, + {128,[1,4,1,3,2,2],#{location => 2,node => 2}}, + {128,[1,4,2,2,3,1,2],#{location => 2,node => 2}}, + {128,[1,4,4,2,2,6],#{location => 3,node => 3}}, + {128,[1,4,6,1,4],#{location => 2,node => 2}}, + {128,[1,5,1,4,2,6],#{location => 2,node => 2}}, + {128,[1,5,4,1,5],#{location => 3,node => 3}}, + {128,[2,1,1,3,1,2,2],#{location => 4,node => 4}}, + {128,[2,1,3,4],#{location => 2,node => 2}}, + {128,[2,1,4,2,3],#{location => 2,node => 2}}, + {128,[2,1,5,1,3,3],#{location => 2,node => 2}}, + {128,[2,1,5,3],#{location => 2,node => 2}}, + {128,[2,2,3,1],#{location => 2,node => 2}}, + {128,[2,2,3,1,2],#{location => 2,node => 2}}, + {128,[2,2,3,2,2,5,6],#{location => 3,node => 3}}, + {128,[2,2,3,4,3,3,2],#{location => 3,node => 3}}, + {128,[2,2,3,4,5,4],#{location => 3,node => 3}}, + {128,[2,2,5,1,3,2,4],#{location => 2,node => 2}}, + {128,[2,3,1,3,1,3,3],#{location => 3,node => 3}}, + {128,[2,3,1,3,2,3,1],#{location => 2,node => 2}}, + {128,[2,3,2,2,3,1],#{location => 4,node => 4}}, + {128,[2,3,2,4,2,3,4],#{location => 2,node => 2}}, + {128,[2,3,3,1,1],#{location => 2,node => 2}}, + {128,[2,3,3,1,2,3,3],#{location => 2,node => 2}}, + {128,[2,3,3,2,4,2],#{location => 2,node => 2}}, + {128,[2,3,3,4,2,1],#{location => 3,node => 3}}, + {128,[2,3,3,4,3],#{location => 3,node => 
3}}, + {128,[2,3,4,2,1],#{location => 2,node => 2}}, + {128,[2,3,4,2,3],#{location => 3,node => 3}}, + {128,[2,3,5,1,3,3],#{location => 3,node => 3}}, + {128,[2,4,1,1],#{location => 2,node => 2}}, + {128,[2,4,3,1,3],#{location => 3,node => 3}}, + {128,[2,4,3,1,5,3],#{location => 2,node => 2}}, + {128,[2,4,4,4],#{location => 2,node => 2}}, + {128,[2,4,4,4,1,2,4],#{location => 3,node => 3}}, + {128,[2,4,4,4,5,2],#{location => 4,node => 4}}, + {128,[2,5,1,4,5,4,3],#{location => 4,node => 4}}, + {128,[2,5,2,4,4,4],#{location => 3,node => 3}}, + {128,[3,1,3,2,4,2,1],#{location => 4,node => 4}}, + {128,[3,1,3,3,3],#{location => 3,node => 3}}, + {128,[3,1,4,1,2,2,3],#{location => 2,node => 2}}, + {128,[3,2,1,3],#{location => 2,node => 2}}, + {128,[3,2,2,1,2,2,1],#{location => 2,node => 2}}, + {128,[3,2,3,1,1],#{location => 3,node => 3}}, + {128,[3,2,4,4,5,5],#{location => 2,node => 2}}, + {128,[3,3,1,1,2,1],#{location => 2,node => 2}}, + {128,[3,3,1,3],#{location => 2,node => 2}}, + {128,[3,3,2,2,5,5,2],#{location => 2,node => 2}}, + {128,[3,3,3,6,2,4,2],#{location => 4,node => 4}}, + {128,[3,3,4,2],#{location => 2,node => 2}}, + {128,[3,3,4,2],#{location => 3,node => 3}}, + {128,[3,3,5,1],#{location => 2,node => 2}}, + {128,[3,4,1,4,4],#{location => 4,node => 4}}, + {128,[3,4,3,4,2,1,4],#{location => 2,node => 2}}, + {128,[3,4,4,2,2,2],#{location => 3,node => 3}}, + {128,[3,5,3,2,2,1,2],#{location => 2,node => 2}}, + {128,[3,5,4,5,1,3],#{location => 2,node => 2}}, + {128,[4,1,4,2,1],#{location => 3,node => 3}}, + {128,[4,1,5,3],#{location => 2,node => 2}}, + {128,[4,2,1,3,4,4,3],#{location => 2,node => 2}}, + {128,[4,2,2,1,2,4],#{location => 2,node => 2}}, + {128,[4,2,2,3],#{location => 2,node => 2}}, + {128,[4,2,3,2,1,2,4],#{location => 3,node => 3}}, + {128,[4,2,4,4],#{location => 2,node => 2}}, + {128,[4,3,2,3,1],#{location => 2,node => 2}}, + {128,[4,4,1,1,4,2,4],#{location => 2,node => 2}}, + {128,[4,4,2,5],#{location => 2,node => 2}}, + {128,[4,4,4,3,5,2,3],#{location => 3,node => 3}}, + {128,[4,5,1,4,5,2,3],#{location => 2,node => 2}}, + {128,[4,5,2,3,2,4,2],#{location => 4,node => 4}}, + {128,[4,5,3,5],#{location => 3,node => 3}}, + {128,[4,6,6,2],#{location => 2,node => 2}}, + {128,[5,1,5,5,2,3],#{location => 4,node => 4}}, + {128,[5,2,1,2,2,1,4],#{location => 2,node => 2}}, + {128,[5,2,1,3,4],#{location => 2,node => 2}}, + {128,[5,2,4,3,5,3,5],#{location => 4,node => 4}}, + {128,[5,2,6,6,1,5,5],#{location => 2,node => 2}}, + {128,[5,3,4,4,2,5,1],#{location => 3,node => 3}}, + {128,[5,5,1,3],#{location => 2,node => 2}}, + {128,[5,5,1,4],#{location => 2,node => 2}}, + {128,[5,5,4,6,2,5],#{location => 2,node => 2}}, + {128,[5,6,1],#{location => 2,node => 2}}, + {128,[5,6,4,1,6,3,2],#{location => 2,node => 2}}, + {128,[5,6,4,6,6],#{location => 3,node => 3}}, + {128,[6,1,1,5,3,6],#{location => 2,node => 2}}, + {128,[6,1,2,4,5,2],#{location => 2,node => 2}}, + {128,[6,2,1,1,5,5,3],#{location => 3,node => 3}}, + {128,[6,2,5,2,6,6,2],#{location => 4,node => 4}}, + {256,[1,1,1,2,3,2,3],#{location => 3,node => 3}}, + {256,[1,1,3,2,3,3],#{location => 3,node => 3}}, + {256,[1,1,4,4,2,2],#{location => 3,node => 3}}, + {256,[1,2,3],#{location => 2,node => 2}}, + {256,[1,2,3,2,3,1,1],#{location => 2,node => 2}}, + {256,[1,3,2,3,4,3],#{location => 2,node => 2}}, + {256,[1,3,3,3,4,4],#{location => 2,node => 2}}, + {256,[1,3,5,3,1,3],#{location => 3,node => 3}}, + {256,[1,4,2,1],#{location => 2,node => 2}}, + {256,[1,4,3,3,4],#{location => 3,node => 3}}, + {256,[1,4,4,3,2,2,3],#{location => 3,node => 
3}}, + {256,[1,4,4,4,3],#{location => 4,node => 4}}, + {256,[2,1,1,2,3,1,1],#{location => 2,node => 2}}, + {256,[2,1,3,1],#{location => 2,node => 2}}, + {256,[2,1,4,1,3,6,4],#{location => 2,node => 2}}, + {256,[2,1,5,4,3,5],#{location => 2,node => 2}}, + {256,[2,2,5,4,3],#{location => 3,node => 3}}, + {256,[2,3,1,3],#{location => 3,node => 3}}, + {256,[2,3,5,2,6,2],#{location => 3,node => 3}}, + {256,[2,3,5,4,5],#{location => 2,node => 2}}, + {256,[2,4,1,5,1],#{location => 2,node => 2}}, + {256,[2,4,2,4,4,2],#{location => 4,node => 4}}, + {256,[2,6,3,4,5,1],#{location => 2,node => 2}}, + {256,[2,6,5,5,5,1,2],#{location => 2,node => 2}}, + {256,[3,1,2,1,3,1,1],#{location => 2,node => 2}}, + {256,[3,1,2,1,5,3],#{location => 2,node => 2}}, + {256,[3,1,2,2,5,3,6],#{location => 2,node => 2}}, + {256,[3,1,2,4,1,4],#{location => 2,node => 2}}, + {256,[3,1,3,1],#{location => 2,node => 2}}, + {256,[3,1,3,2,2,2,2],#{location => 2,node => 2}}, + {256,[3,1,4,4],#{location => 3,node => 3}}, + {256,[3,2,1,3],#{location => 2,node => 2}}, + {256,[3,2,3,3,1,3,3],#{location => 3,node => 3}}, + {256,[3,2,4,3,4,1],#{location => 2,node => 2}}, + {256,[3,2,4,4,5,4],#{location => 3,node => 3}}, + {256,[3,2,5],#{location => 2,node => 2}}, + {256,[3,3,1,3,3],#{location => 2,node => 2}}, + {256,[3,3,4,2],#{location => 2,node => 2}}, + {256,[3,3,4,4,1],#{location => 2,node => 2}}, + {256,[3,3,5,4,3,3,4],#{location => 2,node => 2}}, + {256,[3,3,6,3,3,5],#{location => 3,node => 3}}, + {256,[3,4,2,2,4,3],#{location => 4,node => 4}}, + {256,[3,5,1,3,3,5],#{location => 3,node => 3}}, + {256,[3,5,1,6,5],#{location => 2,node => 2}}, + {256,[3,5,3,4],#{location => 3,node => 3}}, + {256,[3,5,4],#{location => 2,node => 2}}, + {256,[4,1,1,3,4,5],#{location => 3,node => 3}}, + {256,[4,1,2,2],#{location => 2,node => 2}}, + {256,[4,1,2,3,1,2,3],#{location => 2,node => 2}}, + {256,[4,1,2,3,4,1],#{location => 2,node => 2}}, + {256,[4,1,3],#{location => 2,node => 2}}, + {256,[4,1,3,4,1,5],#{location => 3,node => 3}}, + {256,[4,1,4,1],#{location => 2,node => 2}}, + {256,[4,1,4,1,2,2],#{location => 2,node => 2}}, + {256,[4,1,5,2,3,2,2],#{location => 2,node => 2}}, + {256,[4,2,1,2,3,3,6],#{location => 3,node => 3}}, + {256,[4,2,1,5,2,3,2],#{location => 3,node => 3}}, + {256,[4,2,3,2],#{location => 2,node => 2}}, + {256,[4,2,3,2,2,3],#{location => 2,node => 2}}, + {256,[4,2,3,2,4,2,3],#{location => 3,node => 3}}, + {256,[4,2,4,2,2,2],#{location => 4,node => 4}}, + {256,[4,2,5,3,6,6,1],#{location => 4,node => 4}}, + {256,[4,3,2,3,2,2],#{location => 4,node => 4}}, + {256,[4,3,2,5,1,5,5],#{location => 3,node => 3}}, + {256,[4,3,3,1,1,3],#{location => 2,node => 2}}, + {256,[4,3,4,3,4,1,1],#{location => 3,node => 3}}, + {256,[4,3,4,4,3,5,6],#{location => 4,node => 4}}, + {256,[4,4,1,2,1,3,3],#{location => 4,node => 4}}, + {256,[4,4,1,3],#{location => 3,node => 3}}, + {256,[4,4,2,3],#{location => 3,node => 3}}, + {256,[4,5,4,1],#{location => 2,node => 2}}, + {256,[4,5,4,6,3,3,2],#{location => 2,node => 2}}, + {256,[5,1,5],#{location => 2,node => 2}}, + {256,[5,2,1,3],#{location => 2,node => 2}}, + {256,[5,2,5,2,6],#{location => 2,node => 2}}, + {256,[5,2,5,3,3,3,3],#{location => 2,node => 2}}, + {256,[5,2,6,4],#{location => 2,node => 2}}, + {256,[5,3,3,2,1,5,6],#{location => 2,node => 2}}, + {256,[5,4,3,4],#{location => 3,node => 3}}, + {256,[5,4,4,5,5,2],#{location => 3,node => 3}}, + {256,[5,5,4,1],#{location => 3,node => 3}}, + {256,[5,6,1,4,6,4,1],#{location => 4,node => 4}}, + {256,[6,1,1,3,3],#{location => 2,node => 2}}, + 
{256,[6,2,5,3,1,6,1],#{location => 3,node => 3}}, + {256,[6,3,1,3],#{location => 2,node => 2}}, + {256,[6,3,3,6,6,5,4],#{location => 2,node => 2}}, + {256,[6,4,3,1,4],#{location => 2,node => 2}}, + {256,[6,6,2,3,3,6],#{location => 3,node => 3}}, + {256,[6,6,3,1,5,2],#{location => 2,node => 2}}, + {256,[6,6,4,5],#{location => 2,node => 2}}, + {256,[6,6,5,2,2,6],#{location => 2,node => 2}}, + {256,[6,6,6,3,5,6],#{location => 2,node => 2}}, + {512,[1,1,2,1,4,1,4],#{location => 2,node => 2}}, + {512,[1,1,2,3,3,2,2],#{location => 2,node => 2}}, + {512,[1,1,3,2,1],#{location => 2,node => 2}}, + {512,[1,1,3,2,2,4,3],#{location => 3,node => 3}}, + {512,[1,2,1,2,2,3],#{location => 3,node => 3}}, + {512,[1,2,3],#{location => 2,node => 2}}, + {512,[1,2,3,3,1],#{location => 3,node => 3}}, + {512,[1,3,2,2,3,1,3],#{location => 4,node => 4}}, + {512,[1,3,4],#{location => 2,node => 2}}, + {512,[1,3,4,3,5],#{location => 3,node => 3}}, + {512,[1,4,2,3],#{location => 2,node => 2}}, + {512,[1,4,2,3,1,3,2],#{location => 4,node => 4}}, + {512,[1,4,4],#{location => 2,node => 2}}, + {512,[1,4,4,2,4],#{location => 3,node => 3}}, + {512,[1,4,4,4],#{location => 2,node => 2}}, + {512,[1,4,5,2,4,1,4],#{location => 3,node => 3}}, + {512,[1,5,2,1,2,5,2],#{location => 2,node => 2}}, + {512,[1,5,3,3,4,1,3],#{location => 4,node => 4}}, + {512,[1,5,4],#{location => 2,node => 2}}, + {512,[1,5,5],#{location => 2,node => 2}}, + {512,[1,6,2,4,2,4],#{location => 3,node => 3}}, + {512,[2,1,2,3,3,1,1],#{location => 3,node => 3}}, + {512,[2,1,3,2,3,3,4],#{location => 2,node => 2}}, + {512,[2,1,4,2,3,1,1],#{location => 3,node => 3}}, + {512,[2,1,4,3,2],#{location => 2,node => 2}}, + {512,[2,1,4,4,1,3],#{location => 2,node => 2}}, + {512,[2,1,4,4,4,4,2],#{location => 2,node => 2}}, + {512,[2,2,3,2,1,4,1],#{location => 2,node => 2}}, + {512,[2,3,1,4,2,1,6],#{location => 2,node => 2}}, + {512,[2,3,2,1],#{location => 2,node => 2}}, + {512,[2,3,2,4,4,2,5],#{location => 3,node => 3}}, + {512,[2,3,3,2,1],#{location => 2,node => 2}}, + {512,[2,3,3,2,3,1],#{location => 4,node => 4}}, + {512,[2,4,1,2,2],#{location => 2,node => 2}}, + {512,[2,4,2,4],#{location => 3,node => 3}}, + {512,[2,4,2,4,2],#{location => 3,node => 3}}, + {512,[2,4,4,1,4,4,2],#{location => 3,node => 3}}, + {512,[2,5,1,5,2],#{location => 3,node => 3}}, + {512,[2,5,5,4,5,4],#{location => 4,node => 4}}, + {512,[3,1,1,1,3,2],#{location => 3,node => 3}}, + {512,[3,1,1,2,3,1,1],#{location => 3,node => 3}}, + {512,[3,1,1,2,3,3,3],#{location => 4,node => 4}}, + {512,[3,1,1,3,3,1],#{location => 3,node => 3}}, + {512,[3,1,1,4,4,3],#{location => 3,node => 3}}, + {512,[3,1,3,3,2,1],#{location => 3,node => 3}}, + {512,[3,1,3,5],#{location => 2,node => 2}}, + {512,[3,2,1,1],#{location => 2,node => 2}}, + {512,[3,2,2,1,3,3],#{location => 2,node => 2}}, + {512,[3,2,2,1,3,4],#{location => 3,node => 3}}, + {512,[3,2,2,2,3,1],#{location => 2,node => 2}}, + {512,[3,2,4,1,2],#{location => 2,node => 2}}, + {512,[3,2,4,5,3],#{location => 2,node => 2}}, + {512,[3,2,5,4,3,2],#{location => 3,node => 3}}, + {512,[3,3,2,2,1,1],#{location => 3,node => 3}}, + {512,[3,3,3,1,2],#{location => 2,node => 2}}, + {512,[3,3,5,4,1,1,4],#{location => 2,node => 2}}, + {512,[3,4,2,2],#{location => 2,node => 2}}, + {512,[3,4,2,3],#{location => 2,node => 2}}, + {512,[3,4,3,1,6,3,6],#{location => 3,node => 3}}, + {512,[3,4,3,4,2,4],#{location => 2,node => 2}}, + {512,[3,4,4,1],#{location => 3,node => 3}}, + {512,[3,4,4,2],#{location => 3,node => 3}}, + {512,[3,4,5,5],#{location => 2,node => 2}}, + 
{512,[3,5,4,1,1],#{location => 2,node => 2}}, + {512,[3,5,4,1,5],#{location => 3,node => 3}}, + {512,[3,6,2,2,5],#{location => 3,node => 3}}, + {512,[4,1,2,3,2,2],#{location => 2,node => 2}}, + {512,[4,1,2,4,1,4],#{location => 3,node => 3}}, + {512,[4,1,2,4,5,3,4],#{location => 4,node => 4}}, + {512,[4,1,4,2,2,2],#{location => 3,node => 3}}, + {512,[4,1,4,2,5,4],#{location => 3,node => 3}}, + {512,[4,2,1,4,4],#{location => 2,node => 2}}, + {512,[4,2,3,2,6],#{location => 2,node => 2}}, + {512,[4,2,3,4,6],#{location => 3,node => 3}}, + {512,[4,3,2],#{location => 2,node => 2}}, + {512,[4,3,2,2,1,4],#{location => 4,node => 4}}, + {512,[4,3,2,4,3,3,3],#{location => 3,node => 3}}, + {512,[4,3,3,1,2],#{location => 2,node => 2}}, + {512,[4,3,5,4,4,5],#{location => 4,node => 4}}, + {512,[4,4,1,5,2],#{location => 3,node => 3}}, + {512,[4,4,1,6,6,3,3],#{location => 2,node => 2}}, + {512,[4,4,2,4,2],#{location => 3,node => 3}}, + {512,[4,4,2,4,4,1],#{location => 3,node => 3}}, + {512,[4,4,4,2,3],#{location => 2,node => 2}}, + {512,[4,4,5,3,5,4],#{location => 4,node => 4}}, + {512,[4,4,5,5,3,2,1],#{location => 2,node => 2}}, + {512,[4,5,3],#{location => 2,node => 2}}, + {512,[4,5,4,4,6,5],#{location => 3,node => 3}}, + {512,[4,5,5,6,6],#{location => 3,node => 3}}, + {512,[4,6,5],#{location => 2,node => 2}}, + {512,[5,1,2,1,2,2],#{location => 2,node => 2}}, + {512,[5,1,4,4],#{location => 2,node => 2}}, + {512,[5,2,2,4,3,1],#{location => 2,node => 2}}, + {512,[5,2,3],#{location => 2,node => 2}}, + {512,[5,2,4,1],#{location => 2,node => 2}}, + {512,[5,2,4,3,1],#{location => 3,node => 3}}, + {512,[5,2,5,5,3,1,1],#{location => 2,node => 2}}, + {512,[5,3,3,5],#{location => 2,node => 2}}, + {512,[5,3,6,5],#{location => 3,node => 3}}, + {512,[5,5,1,3,4],#{location => 3,node => 3}}, + {512,[5,5,4,4,5,1],#{location => 2,node => 2}}, + {512,[5,6,4,3],#{location => 3,node => 3}}, + {512,[6,5,4,4],#{location => 2,node => 2}}]. From c9ca336d0f9a266ff7c5729b29d84ca57b3d9ed9 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Tue, 23 May 2023 12:38:06 +0100 Subject: [PATCH 23/30] Add full-rebalance for v4 The leave call on a failure of simple_transfer will call sequential_claim - which is part of the v2 claim family. Now we have v4, if this is configured it should call v4 as it does handle leaves. --- src/riak_core_claim_binring_alg.erl | 17 ++++++++++------- src/riak_core_membership_claim.erl | 17 ++++++++++++++++- src/riak_core_membership_leave.erl | 2 +- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/src/riak_core_claim_binring_alg.erl b/src/riak_core_claim_binring_alg.erl index 6174ea4e5..f6319fcd5 100644 --- a/src/riak_core_claim_binring_alg.erl +++ b/src/riak_core_claim_binring_alg.erl @@ -434,11 +434,14 @@ worth_brute_force(RingSize, V) -> true -> no_brute_force end. -maybe_brute_force(Ring, NVals) -> +maybe_brute_force(Ring, NVals, Options) -> case worth_brute_force(ring_size(Ring), violations(Ring, NVals)) of - brute_force -> brute_force(Ring, NVals); - node_only -> brute_force(Ring, NVals, [node_only]); - no_brute_force -> Ring + brute_force -> + brute_force(Ring, NVals, Options); + node_only -> + brute_force(Ring, NVals, [node_only|Options]); + no_brute_force -> + Ring end. 
@@ -472,7 +475,7 @@ solve(RingSize, Config, NValsMap, Options) -> %% Should not ask for this case if NoBruteForce -> BigRingD; AlwaysBruteForce -> brute_force(BigRingD, NVals); - true -> maybe_brute_force(BigRingD, NVals) + true -> maybe_brute_force(BigRingD, NVals, []) end; _ -> BigRingI = solve_node_insertions(Cycle(Rounds), NVals, Extras), @@ -488,7 +491,7 @@ solve(RingSize, Config, NValsMap, Options) -> end, if NoBruteForce -> BFRing; AlwaysBruteForce -> brute_force(BigRingD, NVals); - true -> maybe_brute_force(BFRing, NVals) + true -> maybe_brute_force(BFRing, NVals, []) end end. @@ -571,7 +574,7 @@ update(OldRing, Config, NValsMap) -> ToRemove = OldNodes -- NewNodes, %% Swap in new nodes for old nodes (in a moderately clever way) NewRing = swap_in_nodes(OldRing, ToAdd, ToRemove, NVals), - maybe_brute_force(NewRing, NVals). + maybe_brute_force(NewRing, NVals, [{only_swap, true}]). swap_in_nodes(Ring, [], [], _NVals) -> Ring; swap_in_nodes(Ring, [New | ToAdd], ToRemove, NVals) -> diff --git a/src/riak_core_membership_claim.erl b/src/riak_core_membership_claim.erl index 34195fc79..3af0e8785 100644 --- a/src/riak_core_membership_claim.erl +++ b/src/riak_core_membership_claim.erl @@ -52,7 +52,7 @@ %% default is 4. -module(riak_core_membership_claim). --export([claim/1, claim/3, claim_until_balanced/2, claim_until_balanced/4]). +-export([claim/1, claim/3, claim_until_balanced/2, claim_until_balanced/4, full_rebalance/2]). -export([default_wants_claim/1, default_wants_claim/2, default_choose_claim/1, default_choose_claim/2, default_choose_claim/3, default_choose_params/0, default_choose_params/1]). @@ -112,6 +112,21 @@ claim(Ring) -> end, claim(Ring, Want, Choose). +-spec full_rebalance( + riak_core_ring:riak_core_ring(), node()) -> riak_core_ring:riak_core_ring(). +full_rebalance(Ring, HeadNode) -> + case app_helper:get_env(riak_core, choose_claim_fun) of + choose_claim_v2 -> + sequential_claim(Ring, HeadNode); + choose_claim_v3 -> + sequential_claim(Ring, HeadNode); + choose_claim_v4 -> + riak_core_claim_swapping:claim(Ring); + _ -> + sequential_claim(Ring, HeadNode) + end. + + %% @doc claim/3 is used in tests as it allows for {Mod, Fun, Params} to be %% passed in as the choose function, to override selection of defaults from %% application environment for target n_vals. diff --git a/src/riak_core_membership_leave.erl b/src/riak_core_membership_leave.erl index 886b31646..111fce9ef 100644 --- a/src/riak_core_membership_leave.erl +++ b/src/riak_core_membership_leave.erl @@ -55,7 +55,7 @@ remove_from_cluster(Ring, ExitingNode, Seed) -> end, Ring, Owners), - riak_core_membership_claim:sequential_claim(TempRing, HN) + riak_core_membership_claim:full_rebalance(TempRing, HN) end, ExitRing. From 470a57fcc01de9548a3fb81fa7625b2b4b8661b8 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Tue, 23 May 2023 17:40:27 +0100 Subject: [PATCH 24/30] Support leave in prop_claim --- src/riak_core_membership_leave.erl | 55 +++++++++++++++------------ test/riak_core_claim_eqc.erl | 60 ++++++++++++++++++++++-------- 2 files changed, 77 insertions(+), 38 deletions(-) diff --git a/src/riak_core_membership_leave.erl b/src/riak_core_membership_leave.erl index 111fce9ef..e87fc03db 100644 --- a/src/riak_core_membership_leave.erl +++ b/src/riak_core_membership_leave.erl @@ -28,18 +28,34 @@ -include_lib("eunit/include/eunit.hrl"). -endif. --export([remove_from_cluster/2, remove_from_cluster/3]). +-export([remove_from_cluster/2, remove_from_cluster/3, remove_from_cluster/4]). 
remove_from_cluster(Ring, ExitingNode) -> - remove_from_cluster(Ring, ExitingNode, rand:seed(exrop, os:timestamp())). + remove_from_cluster( + Ring, ExitingNode, rand:seed(exrop, os:timestamp())). remove_from_cluster(Ring, ExitingNode, Seed) -> + ForceRebalance = + app_helper:get_env(riak_core, full_rebalance_onleave, false), + remove_from_cluster(Ring, ExitingNode, Seed, ForceRebalance). + + +remove_from_cluster(Ring, ExitingNode, Seed, ForceRebalance) -> % Transfer indexes to other nodes... Owners = riak_core_ring:all_owners(Ring), Members = riak_core_ring:claiming_members(Ring), + + STR = + case ForceRebalance of + true -> + force_rebalance; + false -> + attempt_simple_transfer( + Ring, ExitingNode, Seed, Owners, Members) + end, + ExitRing = - case attempt_simple_transfer(Ring, ExitingNode, Seed, - Owners, Members) of + case STR of {ok, NR} -> NR; _ -> @@ -84,25 +100,18 @@ remove_from_cluster(Ring, ExitingNode, Seed) -> target_n_fail| force_rebalance. attempt_simple_transfer(Ring, ExitingNode, Seed, Owners, Members) -> - ForceRebalance = - app_helper:get_env(riak_core, full_rebalance_onleave, false), - case ForceRebalance of - true -> - force_rebalance; - false -> - TargetN = app_helper:get_env(riak_core, target_n_val), - Counts = - riak_core_membership_claim:get_counts(Members, Owners), - RingFun = - fun(Partition, Node, R) -> - riak_core_ring:transfer_node(Partition, Node, R), - R - end, - simple_transfer(Owners, - {RingFun, TargetN, ExitingNode}, - Ring, - {Seed, [], Counts}) - end. + TargetN = app_helper:get_env(riak_core, target_n_val), + Counts = + riak_core_membership_claim:get_counts(Members, Owners), + RingFun = + fun(Partition, Node, R) -> + riak_core_ring:transfer_node(Partition, Node, R), + R + end, + simple_transfer(Owners, + {RingFun, TargetN, ExitingNode}, + Ring, + {Seed, [], Counts}). %% @doc Simple transfer of leaving node's vnodes to safe place %% Iterates over Owners, which must be sorted by Index (from 0...), and diff --git a/test/riak_core_claim_eqc.erl b/test/riak_core_claim_eqc.erl index d4be6498f..121d81e68 100644 --- a/test/riak_core_claim_eqc.erl +++ b/test/riak_core_claim_eqc.erl @@ -35,6 +35,7 @@ committed_nodes = [], staged_nodes = [] :: [Name :: atom()], %% nodes added/left before claim, plan = [], %% staged nodes after claim + leaving_nodes = [], sufficient = false, with_location = false }). @@ -168,18 +169,35 @@ claim_args(S) -> %% v2 does not take leaving nodes into account, but the model does [elements([v4]), S#state.nval]. 
-claim(Ring, default, Nval) -> - pp(riak_core_membership_claim, claim, [Ring, - {riak_core_membership_claim, default_wants_claim}, - {riak_core_membership_claim, sequential_claim, Nval}]); -claim(Ring, v2, Nval) -> - pp(riak_core_membership_claim, claim, [Ring, +claim(Ring, Algo, Nval) -> + InitialRemoveRing = + case riak_core_ring:members(Ring, [leaving]) of + [] -> + Ring; + LeavingNodes -> + lists:foldl( + fun(RN, R) -> + riak_core_membership_leave:remove_from_cluster( + R, RN, rand:seed(exrop, os:timestamp()), true) + end, + Ring, + LeavingNodes + ) + end, + case Algo of + v4 -> + pp(riak_core_membership_claim, claim, [InitialRemoveRing, {riak_core_membership_claim, wants_claim_v2}, - {riak_core_membership_claim, choose_claim_v2, [{target_n_val, Nval}]}]); -claim(Ring, v4, Nval) -> - pp(riak_core_membership_claim, claim, [Ring, + {riak_core_claim_swapping, choose_claim_v4, [{target_n_val, Nval}]}]); + v2 -> + pp(riak_core_membership_claim, claim, [InitialRemoveRing, {riak_core_membership_claim, wants_claim_v2}, - {riak_core_claim_swapping, choose_claim_v4, [{target_n_val, Nval}]}]). + {riak_core_membership_claim, choose_claim_v2, [{target_n_val, Nval}]}]); + default -> + pp(riak_core_membership_claim, claim, [InitialRemoveRing, + {riak_core_membership_claim, default_wants_claim}, + {riak_core_membership_claim, sequential_claim, Nval}]) + end. claim_pre(#state{sufficient = true} = S, [v4, _Nval]) -> %% Sufficient conditions to actually succeed @@ -188,7 +206,6 @@ claim_pre(_, [_, _]) -> true. - claim_next(S, NewRing, [_, _]) -> S#state{ring = NewRing, plan = S#state.staged_nodes, staged_nodes = []}. @@ -314,7 +331,9 @@ to_config(S) -> %% --- Operation: leave_node --- leave_node_pre(S) -> - length(S#state.nodes) > 1 andalso S#state.committed_nodes/= []. %% try > 1 not to delete the initial node + length(S#state.nodes) > 1 %% try > 1 not to delete the initial node + andalso S#state.committed_nodes/= [] + andalso S#state.plan == []. leave_node_args(S) -> %% TODO consider re-leaving leaved nodes @@ -351,9 +370,20 @@ commit_args(S) -> commit(Ring, Claimant) -> JoiningNodes = riak_core_ring:members(Ring, [joining]), %% [ Node || {Node, joining} <- riak_core_ring:all_member_status(Ring) ], - lists:foldl(fun(Node, R) -> - riak_core_ring:set_member(Claimant, R, Node, valid, same_vclock) - end, Ring, JoiningNodes). + Ring0 = + lists:foldl( + fun(Node, R) -> + riak_core_ring:set_member(Claimant, R, Node, valid, same_vclock) + end, + Ring, + JoiningNodes), + LeavingNodes = riak_core_ring:members(Ring, [leaving]), + lists:foldl( + fun(Node, R) -> + riak_core_ring:remove_member(Claimant, R, Node) + end, + Ring0, + LeavingNodes). commit_next(S, NewRing, [_]) -> S#state{ring = NewRing, staged_nodes = [], plan = [], committed_nodes = S#state.nodes}. 
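Patch 24 above makes the force-rebalance decision an explicit argument of `remove_from_cluster/4`, with the property test driving leaves through it by passing `true`. A minimal sketch of calling the new arity directly follows; the ring fixture and the node names n1..n5 are hypothetical, not part of the patch:

```
%% Hypothetical fixture: a 64-partition ring owned by n1, with four more
%% members added in the same way as the riak_core unit tests do.
Ring0 = riak_core_ring:fresh(64, n1).
Ring1 = lists:foldl(
            fun(N, R) -> riak_core_ring:add_member(n1, R, N) end,
            Ring0,
            [n2, n3, n4, n5]).
%% 'true' as the final argument forces the full-rebalance path,
%% regardless of the full_rebalance_onleave application setting.
ExitRing = riak_core_membership_leave:remove_from_cluster(
               Ring1, n5, rand:seed(exrop, os:timestamp()), true).
```

Passing `false` instead attempts the simple transfer first, matching the behaviour of `remove_from_cluster/3` when `full_rebalance_onleave` is left unset.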
From c6a3dd58e8d6e01116b0cccff8d6fb1dd421fad6 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Tue, 23 May 2023 20:05:11 +0100 Subject: [PATCH 25/30] Update - to use correct claim_fun on leave Property temporarily changed to consider only failures with locations --- src/riak_core_claim_swapping.erl | 43 ++++++++++++++++++++++++++++++ src/riak_core_membership_claim.erl | 10 ++++--- src/riak_core_membership_leave.erl | 11 +++++--- test/riak_core_claim_eqc.erl | 28 +++++++++---------- 4 files changed, 68 insertions(+), 24 deletions(-) diff --git a/src/riak_core_claim_swapping.erl b/src/riak_core_claim_swapping.erl index 7d639ec07..29452efbd 100644 --- a/src/riak_core_claim_swapping.erl +++ b/src/riak_core_claim_swapping.erl @@ -390,6 +390,49 @@ location_t10_test_() -> location_claim_tester(l1n1, loc1, JoiningNodes, 2048, 4, 3) ]}}. +location_t11_test_() -> + JoiningNodes = + [{l2n2, loc2}, {l3n3, loc3}, {l3n4, loc3}, {l4n5, loc4}, {l5n6, loc5}], + {"[1, 1, 2, 1, 1] nval 3 location nval 3", + {inparallel, + [ + location_claim_tester(l1n1, loc1, JoiningNodes, 32, 3, 3), + location_claim_tester(l1n1, loc1, JoiningNodes, 64, 3, 3), + location_claim_tester(l1n1, loc1, JoiningNodes, 256, 3, 3), + location_claim_tester(l1n1, loc1, JoiningNodes, 512, 3, 3), + location_claim_tester(l1n1, loc1, JoiningNodes, 1024, 3, 3), + location_claim_tester(l1n1, loc1, JoiningNodes, 2048, 3, 3) + ]}}. + +location_t12_test_() -> + JoiningNodes = + [{l2n2, loc2}, {l3n3, loc3}, {l4n4, loc4}, {l5n5, loc5}, {l5n6, loc6}], + {"[1, 1, 1, 1, 1, 1] nval 5 location nval 5", + {inparallel, + [ + location_claim_tester(l1n1, loc1, JoiningNodes, 32, 5, 5), + location_claim_tester(l1n1, loc1, JoiningNodes, 64, 5, 5), + location_claim_tester(l1n1, loc1, JoiningNodes, 256, 5, 5), + location_claim_tester(l1n1, loc1, JoiningNodes, 512, 5, 5), + location_claim_tester(l1n1, loc1, JoiningNodes, 1024, 5, 5), + location_claim_tester(l1n1, loc1, JoiningNodes, 2048, 5, 5) + ]}}. + +location_t13_test_() -> + JoiningNodes = + [{l1n2, loc1}, {l2n3, loc2}, {l3n4, loc3}, {l4n5, loc4}], + {"[2, 1, 1, 1] nval 2 location nval 2", + {inparallel, + [ + location_claim_tester(l1n1, loc1, JoiningNodes, 32, 2, 2), + location_claim_tester(l1n1, loc1, JoiningNodes, 64, 2, 2), + location_claim_tester(l1n1, loc1, JoiningNodes, 256, 2, 2), + location_claim_tester(l1n1, loc1, JoiningNodes, 512, 2, 2), + location_claim_tester(l1n1, loc1, JoiningNodes, 1024, 2, 2), + location_claim_tester(l1n1, loc1, JoiningNodes, 2048, 2, 2) + ]}}. + + location_claim_tester(N1, N1Loc, NodeLocList, RingSize, TargetN) -> location_claim_tester(N1, N1Loc, NodeLocList, RingSize, TargetN, TargetN). diff --git a/src/riak_core_membership_claim.erl b/src/riak_core_membership_claim.erl index 3af0e8785..43adbe307 100644 --- a/src/riak_core_membership_claim.erl +++ b/src/riak_core_membership_claim.erl @@ -52,7 +52,7 @@ %% default is 4. -module(riak_core_membership_claim). --export([claim/1, claim/3, claim_until_balanced/2, claim_until_balanced/4, full_rebalance/2]). +-export([claim/1, claim/3, claim_until_balanced/2, claim_until_balanced/4, full_rebalance/3]). -export([default_wants_claim/1, default_wants_claim/2, default_choose_claim/1, default_choose_claim/2, default_choose_claim/3, default_choose_params/0, default_choose_params/1]). @@ -90,6 +90,7 @@ {module(), atom()}|{module(), atom(), list(tuple())}. -type delta() :: {node(), Ownership::non_neg_integer(), Delta::integer()}. -type deltas() :: [delta()]. +-type choose_fun() :: choose_claim_v2|choose_claim_v3|choose_claim_v4. 
%% =================================================================== %% Claim API and supporting functions @@ -113,9 +114,10 @@ claim(Ring) -> claim(Ring, Want, Choose). -spec full_rebalance( - riak_core_ring:riak_core_ring(), node()) -> riak_core_ring:riak_core_ring(). -full_rebalance(Ring, HeadNode) -> - case app_helper:get_env(riak_core, choose_claim_fun) of + riak_core_ring:riak_core_ring(), node(), choose_fun()|undefined) + -> riak_core_ring:riak_core_ring(). +full_rebalance(Ring, HeadNode, ChooseFun) -> + case ChooseFun of choose_claim_v2 -> sequential_claim(Ring, HeadNode); choose_claim_v3 -> diff --git a/src/riak_core_membership_leave.erl b/src/riak_core_membership_leave.erl index e87fc03db..9264aabf2 100644 --- a/src/riak_core_membership_leave.erl +++ b/src/riak_core_membership_leave.erl @@ -28,7 +28,7 @@ -include_lib("eunit/include/eunit.hrl"). -endif. --export([remove_from_cluster/2, remove_from_cluster/3, remove_from_cluster/4]). +-export([remove_from_cluster/2, remove_from_cluster/3, remove_from_cluster/5]). remove_from_cluster(Ring, ExitingNode) -> remove_from_cluster( @@ -37,10 +37,12 @@ remove_from_cluster(Ring, ExitingNode) -> remove_from_cluster(Ring, ExitingNode, Seed) -> ForceRebalance = app_helper:get_env(riak_core, full_rebalance_onleave, false), - remove_from_cluster(Ring, ExitingNode, Seed, ForceRebalance). + ChooseFun = + app_helper:get_env(riak_core, choose_claim_fun), + remove_from_cluster(Ring, ExitingNode, Seed, ForceRebalance, ChooseFun). -remove_from_cluster(Ring, ExitingNode, Seed, ForceRebalance) -> +remove_from_cluster(Ring, ExitingNode, Seed, ForceRebalance, ChooseFun) -> % Transfer indexes to other nodes... Owners = riak_core_ring:all_owners(Ring), Members = riak_core_ring:claiming_members(Ring), @@ -71,7 +73,8 @@ remove_from_cluster(Ring, ExitingNode, Seed, ForceRebalance) -> end, Ring, Owners), - riak_core_membership_claim:full_rebalance(TempRing, HN) + riak_core_membership_claim:full_rebalance( + TempRing, HN, ChooseFun) end, ExitRing. diff --git a/test/riak_core_claim_eqc.erl b/test/riak_core_claim_eqc.erl index 121d81e68..1bea8592f 100644 --- a/test/riak_core_claim_eqc.erl +++ b/test/riak_core_claim_eqc.erl @@ -171,14 +171,14 @@ claim_args(S) -> claim(Ring, Algo, Nval) -> InitialRemoveRing = - case riak_core_ring:members(Ring, [leaving]) of - [] -> + case {riak_core_ring:members(Ring, [leaving]), Algo} of + {[], _} -> Ring; - LeavingNodes -> + {LeavingNodes, v4} -> lists:foldl( fun(RN, R) -> riak_core_membership_leave:remove_from_cluster( - R, RN, rand:seed(exrop, os:timestamp()), true) + R, RN, rand:seed(exrop, os:timestamp()), true, choose_claim_v4) end, Ring, LeavingNodes @@ -186,17 +186,11 @@ claim(Ring, Algo, Nval) -> end, case Algo of v4 -> - pp(riak_core_membership_claim, claim, [InitialRemoveRing, - {riak_core_membership_claim, wants_claim_v2}, - {riak_core_claim_swapping, choose_claim_v4, [{target_n_val, Nval}]}]); - v2 -> - pp(riak_core_membership_claim, claim, [InitialRemoveRing, - {riak_core_membership_claim, wants_claim_v2}, - {riak_core_membership_claim, choose_claim_v2, [{target_n_val, Nval}]}]); - default -> - pp(riak_core_membership_claim, claim, [InitialRemoveRing, - {riak_core_membership_claim, default_wants_claim}, - {riak_core_membership_claim, sequential_claim, Nval}]) + pp(riak_core_membership_claim, + claim, + [InitialRemoveRing, + {riak_core_membership_claim, wants_claim_v2}, + {riak_core_claim_swapping, choose_claim_v4, [{target_n_val, Nval}]}]) end. 
claim_pre(#state{sufficient = true} = S, [v4, _Nval]) -> @@ -420,13 +414,15 @@ prop_claim(Options) -> undefined -> ets:new(timing, [public, named_table, bag]); _ -> ok end, - ?FORALL({Nval, RingSize, WithLocation}, {choose(2, 5), ringsize(), bool()}, + ?FORALL({Nval, RingSize, WithLocation}, {choose(2, 5), ringsize(), true}, ?FORALL(Cmds, commands(?MODULE, initial_state(#{nval => Nval, ring_size => RingSize, sufficient => Relaxed, with_location => WithLocation})), begin put(ring_nr, 0), + % application:set_env(riak_core, full_rebalance_onleave, true), + % application:set_env(riak_core, choose_claim_fun, choose_claim_v4), {H, S, Res} = run_commands(Cmds), Config = lists:sort(to_config(S)), measure(length, commands_length(Cmds), From a49697cc69a3669f3b77250a12ee1f25759d0665 Mon Sep 17 00:00:00 2001 From: Thomas Arts Date: Thu, 25 May 2023 12:07:56 +0200 Subject: [PATCH 26/30] Use application env to read target_n_val (#1007) * Use application env to read target_n_val * Re-introduce v2 in riak_core_claim_eqc * Move precondition to postcondition to also test less perfect cases * Fixed error in transfer_node usage * cleanup not using remove_from_cluster/5. --- src/riak_core_claim_binring_alg.erl | 12 +++++++- src/riak_core_membership_leave.erl | 5 ++-- test/riak_core_claim_eqc.erl | 43 +++++++++++++++++++---------- 3 files changed, 41 insertions(+), 19 deletions(-) diff --git a/src/riak_core_claim_binring_alg.erl b/src/riak_core_claim_binring_alg.erl index a950db49d..f6b88f9b5 100644 --- a/src/riak_core_claim_binring_alg.erl +++ b/src/riak_core_claim_binring_alg.erl @@ -437,10 +437,13 @@ worth_brute_force(RingSize, V) -> maybe_brute_force(Ring, NVals, Options) -> case worth_brute_force(ring_size(Ring), violations(Ring, NVals)) of brute_force -> + ?debug("with nval ~p maybe brute force chose brute_force (~p) \n", [NVals, Options]), brute_force(Ring, NVals, Options); node_only -> + ?debug("with nval ~p maybe brute force chose node_only (~p) \n", [NVals, Options]), brute_force(Ring, NVals, [node_only|Options]); no_brute_force -> + ?debug("with nval ~p maybe brute force chose no_brute_force (~p)\n", [NVals, Options]), Ring end. @@ -574,7 +577,14 @@ update(OldRing, Config, NValsMap) -> ToRemove = OldNodes -- NewNodes, %% Swap in new nodes for old nodes (in a moderately clever way) NewRing = swap_in_nodes(OldRing, ToAdd, ToRemove, NVals), - maybe_brute_force(NewRing, NVals, [{only_swap, true}]). + case node_v(violations(NewRing, NVals)) > max(64, RingSize div 3) of + true -> + %% Heuristics. For larger rings, if a third of the ring is misplaced, we can equally well just + %% start over by solving from start in next phase + NewRing; + false -> + maybe_brute_force(NewRing, NVals, [swap_only]) + end. swap_in_nodes(Ring, [], [], _NVals) -> Ring; swap_in_nodes(Ring, [New | ToAdd], ToRemove, NVals) -> diff --git a/src/riak_core_membership_leave.erl b/src/riak_core_membership_leave.erl index 9264aabf2..842aa0507 100644 --- a/src/riak_core_membership_leave.erl +++ b/src/riak_core_membership_leave.erl @@ -28,7 +28,7 @@ -include_lib("eunit/include/eunit.hrl"). -endif. --export([remove_from_cluster/2, remove_from_cluster/3, remove_from_cluster/5]). +-export([remove_from_cluster/2, remove_from_cluster/3]). 
remove_from_cluster(Ring, ExitingNode) -> remove_from_cluster( @@ -108,8 +108,7 @@ attempt_simple_transfer(Ring, ExitingNode, Seed, Owners, Members) -> riak_core_membership_claim:get_counts(Members, Owners), RingFun = fun(Partition, Node, R) -> - riak_core_ring:transfer_node(Partition, Node, R), - R + riak_core_ring:transfer_node(Partition, Node, R) end, simple_transfer(Owners, {RingFun, TargetN, ExitingNode}, diff --git a/test/riak_core_claim_eqc.erl b/test/riak_core_claim_eqc.erl index 1bea8592f..5b62b66a7 100644 --- a/test/riak_core_claim_eqc.erl +++ b/test/riak_core_claim_eqc.erl @@ -167,18 +167,25 @@ claim_pre(S) -> claim_args(S) -> %% v2 does not take leaving nodes into account, but the model does - [elements([v4]), S#state.nval]. + [elements([v2, v4]), S#state.nval]. claim(Ring, Algo, Nval) -> + case Algo of + v4 -> + application:set_env(riak_core, choose_claim_fun, choose_claim_v4), + application:set_env(riak_core, full_rebalance_onleave, true); + v2 -> + application:set_env(riak_core, choose_claim_fun, choose_claim_v2), + application:set_env(riak_core, full_rebalance_onleave, false) + end, InitialRemoveRing = case {riak_core_ring:members(Ring, [leaving]), Algo} of {[], _} -> Ring; - {LeavingNodes, v4} -> + {LeavingNodes, _} -> lists:foldl( fun(RN, R) -> - riak_core_membership_leave:remove_from_cluster( - R, RN, rand:seed(exrop, os:timestamp()), true, choose_claim_v4) + pp(riak_core_membership_leave, remove_from_cluster, [R, RN]) end, Ring, LeavingNodes @@ -190,12 +197,15 @@ claim(Ring, Algo, Nval) -> claim, [InitialRemoveRing, {riak_core_membership_claim, wants_claim_v2}, - {riak_core_claim_swapping, choose_claim_v4, [{target_n_val, Nval}]}]) + {riak_core_claim_swapping, choose_claim_v4, [{target_n_val, Nval}]}]); + v2 -> + pp(riak_core_membership_claim, + claim, + [InitialRemoveRing, + {riak_core_membership_claim, wants_claim_v2}, + {riak_core_membership_claim, choose_claim_v2, [{target_n_val, Nval}]}]) end. -claim_pre(#state{sufficient = true} = S, [v4, _Nval]) -> - %% Sufficient conditions to actually succeed - sufficient_conditions(S); claim_pre(_, [_, _]) -> true. @@ -203,7 +213,7 @@ claim_pre(_, [_, _]) -> claim_next(S, NewRing, [_, _]) -> S#state{ring = NewRing, plan = S#state.staged_nodes, staged_nodes = []}. -claim_post(#state{nval = Nval, ring_size = RingSize, nodes = Nodes} = S, [_, _], NewRing) -> +claim_post(#state{nval = Nval, ring_size = RingSize, nodes = Nodes} = S, [Algo, _], NewRing) -> Preflists = riak_core_ring:all_preflists(NewRing, Nval), LocNval = if Nval > 3 -> Nval - 1; true -> Nval end, @@ -243,10 +253,10 @@ claim_post(#state{nval = Nval, ring_size = RingSize, nodes = Nodes} = S, [_, _], eqc_statem:tag(node_count, eq(RiakNodeCount, length(Nodes))), eqc_statem:tag(meets_target_n, eq(riak_core_membership_claim:meets_target_n(NewRing, Nval), {true, []})), eqc_statem:tag(correct_nodes, eq(chash:members(riak_core_ring:chash(NewRing)), lists:sort(Nodes))), - eqc_statem:tag(perfect_pls, eq(ImperfectPLs, [])), - eqc_statem:tag(perfect_locations, eq(ImperfectLocations, [])), + eqc_statem:tag(perfect_pls, eq(ImperfectPLs, [])) ] ++ + [ eqc_statem:tag(perfect_locations, eq(ImperfectLocations, [])) || Algo == v4 andalso sufficient_conditions(S)] ++ %% eqc_statem:tag(few_moves, length(S#state.committed_nodes) =< 1 orelse length(diff_nodes(S#state.ring, NewRing)) < S#state.ring_size div 2), - eqc_statem:tag(balanced_ring, BalancedRing)]). + [ eqc_statem:tag(balanced_ring, BalancedRing) ]). 
claim_features(#state{nodes = Nodes} = S, [Alg, _], Res) -> [{claimed_nodes, length(Nodes)}, @@ -335,8 +345,8 @@ leave_node_args(S) -> S#state.with_location, S#state.claimant]. -leave_node_pre(#state{nodes=_Nodes}, [Node, _, Claimant]) -> - Claimant /= Node. %% andalso lists:member(Node, Nodes). +leave_node_pre(#state{nodes = Nodes}, [Node, _, Claimant]) -> + Claimant /= Node andalso lists:member(Node, Nodes). leave_node(Ring, NodeName, _WithLocation, Claimant) -> pp(riak_core_ring, leave_member, [Claimant, Ring, NodeName]). @@ -395,6 +405,8 @@ weight(S, add_located_nodes) when S#state.with_location-> 0; weight(S, leave_node) -> 1 + (length(S#state.committed_nodes) div 4); +weight(S, claim) when S#state.nodes /= [] -> + 10; weight(_S, _Cmd) -> 1. @@ -414,7 +426,7 @@ prop_claim(Options) -> undefined -> ets:new(timing, [public, named_table, bag]); _ -> ok end, - ?FORALL({Nval, RingSize, WithLocation}, {choose(2, 5), ringsize(), true}, + ?FORALL({Nval, RingSize, WithLocation}, {choose(2, 5), ringsize(), bool()}, ?FORALL(Cmds, commands(?MODULE, initial_state(#{nval => Nval, ring_size => RingSize, sufficient => Relaxed, @@ -423,6 +435,7 @@ prop_claim(Options) -> put(ring_nr, 0), % application:set_env(riak_core, full_rebalance_onleave, true), % application:set_env(riak_core, choose_claim_fun, choose_claim_v4), + application:set_env(riak_core, target_n_val, Nval), {H, S, Res} = run_commands(Cmds), Config = lists:sort(to_config(S)), measure(length, commands_length(Cmds), From 60e719997094bb5b1ccb9c8fd8e9796e0211d77f Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Thu, 25 May 2023 17:57:31 +0100 Subject: [PATCH 27/30] Add warning if simple_transfer produces unbalanced result In this case - full_rebalance should be enabled --- src/riak_core_membership_leave.erl | 81 +++++++++++++++++++++++++++++- test/riak_core_claim_eqc.erl | 2 +- 2 files changed, 80 insertions(+), 3 deletions(-) diff --git a/src/riak_core_membership_leave.erl b/src/riak_core_membership_leave.erl index 842aa0507..782faa15b 100644 --- a/src/riak_core_membership_leave.erl +++ b/src/riak_core_membership_leave.erl @@ -59,7 +59,16 @@ remove_from_cluster(Ring, ExitingNode, Seed, ForceRebalance, ChooseFun) -> ExitRing = case STR of {ok, NR} -> - NR; + case check_balanced_or_wanting(NR) of + true -> + NR; + false -> + lager:warning( + "Leave produced unbalanced ring with no wants " + "Consider enabling full_rebalance_onleave" + ), + NR + end; _ -> %% re-diagonalize %% first hand off all claims to *any* one else, @@ -84,6 +93,28 @@ remove_from_cluster(Ring, ExitingNode, Seed, ForceRebalance, ChooseFun) -> -type transfer_ring() :: riak_core_ring:riak_core_ring(). -endif. +-spec check_balanced_or_wanting( + riak_core_ring:riak_core_ring()) -> boolean(). +check_balanced_or_wanting(Ring) -> + Owners = riak_core_ring:all_owners(Ring), + Members = riak_core_ring:claiming_members(Ring), + MinVal = length(Owners) div length(Members), + MaxVal = + case length(Owners) rem length(Members) of + 0 -> + MinVal; + _ -> + MinVal + 1 + end, + Counts = riak_core_membership_claim:get_counts(Members, Owners), + case lists:ukeysort(2, Counts) of + [{_N, _C}] -> + true; + [{_LN, LC}|Rest] -> + [{_HN, HC}|_Rest] = lists:reverse(Rest), + (HC =< MaxVal) or (LC < MinVal) + end. 
+ %% @doc Simple transfer of leaving node's vnodes to safe place %% Where safe place is any node that satisfies target_n_val for that vnode - %% but with a preference to transfer to a node that has a lower number of @@ -319,6 +350,11 @@ transfer_needstobesorted_tester(I) -> ?assertMatch({13, n4}, lists:keyfind(13, 1, R1)). simple_transfer_evendistribution_test() -> + % This results in an uneven distribution + % After the remove, claim will be called. For this to lead to a + % balanced ring there must still be Wants, that will allow for claim to + % run. + R0 = [{0, n1}, {1, n2}, {2, n3}, {3, n4}, {4, n5}, {5, n6}, {6, n7}, {7, n8}, {8, n9}, {9, n10}, {10, n1}, {11, n2}, {12, n3}, {13, n4}, {14, n5}, @@ -347,7 +383,48 @@ simple_transfer_evendistribution_test() -> io:format("NodeCounts ~w~n", [NodeCounts]), [{_LN, LC}|Rest] = NodeCounts, [{_HN, HC}|_] = lists:reverse(Rest), - true = HC - LC == 2. + true = HC - LC == 2, + true = LC < (length(R1) div 9). + + simple_cluster_t1_test() -> + RingSize = 32, + TargetN = 4, + NodeList = [n1, n2, n3, n4, n5, n6], + R0 = riak_core_ring:fresh(RingSize, n1), + ?assert(check_balanced_or_wanting(R0)), + R1 = + lists:foldl( + fun(N, AccR) -> riak_core_ring:add_member(n1, AccR, N) end, + R0, + NodeList -- [n1]), + ?assert(check_balanced_or_wanting(R1)), + Props = [{target_n_val, TargetN}], + RClaim = + riak_core_claim_swapping:claim(R1, Props), + ?assert( + element( + 1, + riak_core_membership_claim:meets_target_n(RClaim, TargetN))), + ?assert(check_balanced_or_wanting(RClaim)), + Owners = riak_core_ring:all_owners(RClaim), + Members = riak_core_ring:claiming_members(RClaim), + Counts = + lists:keysort( + 2, + riak_core_membership_claim:get_counts(Members, Owners)), + {LN, LC} = lists:nth(1, Counts), + {PN, HC} = lists:nth(5, Counts), + {HN, HC} = lists:nth(6, Counts), + ?assert(LC == 5), + ?assert(HC == 6), + {LP, LN} = lists:keyfind(LN, 2, Owners), + {HP, HN} = lists:keyfind(HN, 2, Owners), + RingWithWants = + riak_core_ring:transfer_node(LP, HN, RClaim), + ?assert(check_balanced_or_wanting(RingWithWants)), + RingUnbalancedWithoutWants = + riak_core_ring:transfer_node(HP, PN, RClaim), + ?assertNot(check_balanced_or_wanting(RingUnbalancedWithoutWants)). -endif. diff --git a/test/riak_core_claim_eqc.erl b/test/riak_core_claim_eqc.erl index 5b62b66a7..3d66bc107 100644 --- a/test/riak_core_claim_eqc.erl +++ b/test/riak_core_claim_eqc.erl @@ -176,7 +176,7 @@ claim(Ring, Algo, Nval) -> application:set_env(riak_core, full_rebalance_onleave, true); v2 -> application:set_env(riak_core, choose_claim_fun, choose_claim_v2), - application:set_env(riak_core, full_rebalance_onleave, false) + application:set_env(riak_core, full_rebalance_onleave, true) end, InitialRemoveRing = case {riak_core_ring:members(Ring, [leaving]), Algo} of From 57675fbb69c0932527ef58348c7828d10c157b43 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Fri, 26 May 2023 18:51:36 +0100 Subject: [PATCH 28/30] only_swap/swap_only confusion Add recommendation to use full_rebalance_on_leave for locations --- priv/riak_core.schema | 3 +++ src/riak_core_claim_binring_alg.erl | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/priv/riak_core.schema b/priv/riak_core.schema index f85b2c61c..4c799a263 100644 --- a/priv/riak_core.schema +++ b/priv/riak_core.schema @@ -314,6 +314,9 @@ %% all nodes. %% Please carefully consider any cluster plan created with this option before %% committing +%% If cluster planning with locations enabled, then `full_rebalance_onleave` +%% should also be enabled. 
With claim_v4 this should result in a cluster +%% plan which is correct, but also relatively efficient. {mapping, "full_rebalance_onleave", "riak_core.full_rebalance_onleave", [ {datatype, flag}, {default, off} diff --git a/src/riak_core_claim_binring_alg.erl b/src/riak_core_claim_binring_alg.erl index f6b88f9b5..9dfff2bdc 100644 --- a/src/riak_core_claim_binring_alg.erl +++ b/src/riak_core_claim_binring_alg.erl @@ -583,7 +583,7 @@ update(OldRing, Config, NValsMap) -> %% start over by solving from start in next phase NewRing; false -> - maybe_brute_force(NewRing, NVals, [swap_only]) + maybe_brute_force(NewRing, NVals, [{only_swap, true}]) end. swap_in_nodes(Ring, [], [], _NVals) -> Ring; From bf1e668ddb73ca8dfd906d44ae1803ead90e8ae4 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Sat, 27 May 2023 23:26:05 +0200 Subject: [PATCH 29/30] Update riak_core_claim_eqc.erl --- test/riak_core_claim_eqc.erl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/riak_core_claim_eqc.erl b/test/riak_core_claim_eqc.erl index 3d66bc107..cc55db303 100644 --- a/test/riak_core_claim_eqc.erl +++ b/test/riak_core_claim_eqc.erl @@ -16,6 +16,8 @@ -module(riak_core_claim_eqc). +-ifdef(EQC). + -include_lib("eqc/include/eqc.hrl"). -include_lib("eqc/include/eqc_statem.hrl"). @@ -942,3 +944,5 @@ solution_list() -> {512,[5,5,4,4,5,1],#{location => 2,node => 2}}, {512,[5,6,4,3],#{location => 3,node => 3}}, {512,[6,5,4,4],#{location => 2,node => 2}}]. + +-endif. \ No newline at end of file From 5d8912f8cdf44e3c99205258416868d32eee1b76 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Mon, 12 Jun 2023 12:36:16 +0100 Subject: [PATCH 30/30] Mas i1001 docupdate (#1009) * Change doc Change introduction to refer to vnodes and nodes. Removes the recommendation not to vary location_n_val and node_n_val. * Update comments on having different target n_vals * Further doc updates * Update docs/claim-version4.md * Update docs/claim-version4.md * Update docs/claim-version4.md Co-authored-by: Thomas Arts * Update docs/claim-version4.md Co-authored-by: Thomas Arts --------- Co-authored-by: Thomas Arts --- docs/claim-version4.md | 270 +++++++++++------------------------------ 1 file changed, 68 insertions(+), 202 deletions(-) diff --git a/docs/claim-version4.md b/docs/claim-version4.md index 11040b4ca..d55235e92 100644 --- a/docs/claim-version4.md +++ b/docs/claim-version4.md @@ -1,104 +1,71 @@ # Riak Core Claim Version 4 -This post is about a new version of riak core's claim algorithm -[riak-core's](https://github.com/basho/riak_core). +This post is about a new version of [riak-core's](https://github.com/basho/riak_core) claim algorithm. -An earlier post of [Russell](https://github.com/basho/riak_core/blob/develop/docs/claim-fixes.md) -describes the present claim algorithm (version 2) in detail. That post is -mainly about fixes performed to make things work with so called tail violations. +An earlier post of [Russell](https://github.com/basho/riak_core/blob/develop/docs/claim-fixes.md) describes the present claim algorithm (version 2) in detail. That post is mainly about fixes performed to make things work with so called tail violations. -Recent enhancement of riak core in the form of [location awareness](https://github.com/basho/riak_core/blob/develop/docs/rack-awareness.md) -have made it interesting to revisit and redesign the claim algorithm. 
+Recent enhancements of riak core in the form of [location awareness](https://github.com/basho/riak_core/blob/develop/docs/rack-awareness.md) have made it interesting to revisit and redesign the claim algorithm.
 
-Recapitulate from earlier posts that Riak is a replicated database. By default it stores three replicas of every key/value. The replication factor is called n-val meaning there are n replicas, and by default n=3. If value has to be stored or retrieved, then the hash of that corresponds to one particular node in the ring. It is stored in that node and the n-1 next nodes in the ring. When a node is unavailable, a value will be read from the next node.
+Riak is a replicated database. By default it stores three replicas of every key/value. The replication factor is called n-val, meaning there are n replicas. If a value has to be stored or retrieved, then the hash of its key corresponds to one particular vnode in the ring (the mth vnode). It is stored in that vnode and the n-1 next vnodes in the ring. If any of the vnodes is unavailable (e.g. due to a failure on the node which hosts it), the node which owns the next vnode in the ring (i.e. the m + n vnode) is required to start a fallback vnode as a replacement.
 
-Therefore it is important to put the nodes in a ring such that n-val consecutive nodes are not the same physical node. After all, if a physical node is down and the next node in the ring maps to the same physical node, then there is little redundancy.
+Therefore it is important to allocate the vnodes in the ring to the nodes in a cluster, such that n-val + 1 consecutive vnodes are not on the same physical node. After all, if a physical node is down and the next vnode in the ring maps to the same physical node, then there is little redundancy.
 
-Taking this idea one step further, one may imagine a perfect ring with n-val
-replication, but two of the physical nodes are in the same "location", where
-location can be a rack or a data center. What if something happens that disconnects
-two consecutive physical nodes at once? Wouldn't it be nice if one could also
-take the location into consideration when placing the nodes in the ring, such
-that the ring conveniently spreads over locations?
+Taking this idea one step further, one may imagine a perfect ring with n-val replication, but two of the physical nodes are in the same "location", where location can be a rack or a data center. What if something happens within that location that disconnects two consecutive physical nodes at once? Wouldn't it be nice if one could also take the location into consideration when placing the nodes in the ring, such that each preflist in the ring conveniently spreads over locations?
 
 We came up with a way to do so.
 
 # Requirements
 
-The solution for placing the nodes in a ring is performed by a so called claim
-algorithm. The user provides the nodes, the ring size and the n-val and from that
+The solution for placing the nodes in a ring is performed by a so-called claim algorithm. The user provides the nodes, the ring size (i.e. the number of vnodes) and the target n-val, and from that
 a mapping is returned that tries to fulfil the following:
 
 1. the ring has exactly ring size elements.
 2. all nodes occur approximately equally often: this is called _balanced_.
    (more precisely: for a given k, each node appears exactly k or k+1 times)
-3. n-val consecutive nodes in the ring are all in a different location.
+3. all n-val consecutive vnodes in the ring are in different locations.
+4. 
all n-val consecutive vnodes in the ring are on different nodes (which should follow if 3 is true).
 
-Note that one node cannot be at two locations at the same time. Therefore,
-the n-val for nodes follows from the n-val for locations.
+It is expected that once a cluster is initiated, the ring size and the target n-val will not normally change; however, the number of nodes in the cluster will change.
 
-The first and second requirement cannot be relaxed. But the third one is not
-guaranteed to be possible at all. In fact, Bb using a SAT solver we identified
-82 impossible configuration for ring size 16 and n-val, 2 or 3.
-In case there is no solution possible, the algorithm is supposed to return a
-placement that fulfils the first two requirements and does some kind of best effort
-for the third.
+The first, second and fourth requirements cannot be relaxed. But the third one is not guaranteed to be possible at all. In fact, by using a SAT solver we identified
+82 impossible configurations for ring size 16 and n-val 2 or 3. In case there is no solution possible, the algorithm is supposed to return a placement that fulfils the first two requirements and does some kind of best effort for the third.
 
-In principle one can imagine to have an n-val for locations that is less than
-the n-val for nodes, but although the presented algorithm supports this, we
-recommend not to use that feature, since it may give unexpected placements that
-may well behave worst than using the same but a smaller n-val.
+In cases where the third requirement cannot be supported for a given n-val, it should also be possible to have an n-val for locations that is less than the n-val for nodes, to create a relaxed condition which can then be met.
 
 # Computing placements
 
-We start of by presenting some examples to illustrate how the algorithm can be used.
-The algorithm in [riak_core_claim_binring_alg](https://github.com/basho/riak_core/blob/develop/src/riak_core_claim_binring_alg.erl) is the core algorithm. You will most likely not
-use the API of this module directly, but by isolating the algorithm, development
-and testing becomes easier.
-There are basically 2 API functions that matter: `solve` and `update`.
-Central is the input __configuration__, presented as a list of integers, where Each
-element in the list represents a location and the number represent the number of
-nodes in that location. For example, `[1,1,2]` represents 3 locations, (A, B, and C, say)
-such that the first and second location have 1 node and the third location has 2 nodes.
+We start off by presenting some examples to illustrate how the algorithm can be used. The algorithm in [riak_core_claim_binring_alg](https://github.com/basho/riak_core/blob/develop/src/riak_core_claim_binring_alg.erl) is the core algorithm. You will most likely not use the API of this module directly, but by isolating the algorithm, development and testing becomes easier.
+
+There are basically 2 API functions that matter: `solve` and `update`. Central is the input __configuration__, presented as a list of integers, where each element in the list represents a location and the number represents the number of nodes in that location. For example, `[1,1,2]` represents 3 locations (A, B, and C, say) such that the first and second location have 1 node and the third location has 2 nodes.
 
 ## Solve
 
-Solving starts from a ring size, an n-val and a configuration and provides a
-binary representation of a ring with a placement that fits that configuration.
 
 ## Solve
 
-Solving starts from a ring size, an n-val and a configuration and provides a
-binary representation of a ring with a placement that fits that configuration.
-Consider the binary representation as an opaque type, because there is no need to inspect it.
-An API function "show" can be used to produce a string (ansi colour coded) from such a
+Solving starts from a ring size, an n-val and a configuration and provides a binary representation of a ring with a placement that fits that configuration. Consider the binary representation as an opaque type, because there is no need to inspect it. An API function "show" can be used to produce a string (ANSI colour coded) from such a
 binary ring (only exported for test and debug mode).
 
-In _test_ mode or when you want to get a better understanding of the algorithm,
-solving the above [1,1,2] for ring size 16 and n-val 2 would be done as follows:
+In _test_ mode or when you want to get a better understanding of the algorithm, solving the above [1,1,2] for ring size 16 and n-val 2 would be done as follows:
+
 ```
 BinRing = riak_core_claim_binring_alg:solve(16, [1,1,2], 2).
 io:format("~s\n", [riak_core_claim_binring_alg:show(BinRing, 2)]).
 B1 C2 A1 C1 A1 C1 B1 C2 A1 C1 B1 C2 A1 C1 B1 C2 (0 violations)
 ```
-The location names are alphabetically and the nodes are numbered.
-B1 is the first node in the second location.
-By providing `show` also n-val it can return with `(0 violations)` given the
+
+The location names are alphabetical and the nodes are numbered. B1 is the first node in the second location. By also providing `show` with the n-val, it can report `(0 violations)` for the
 provided ring.
 
 ## Update
 
-When Riak is running, then it has an existing placement of nodes and locations
-in the ring. In that circumstance, one uses update to change the ring to a
-new configuration.
+When Riak is running, it has an existing placement of nodes and locations in the ring. In that circumstance, one uses update to change the ring to a new configuration.
 
 ```
 Disclaimer:
-We have only considered updating the configuration. It would work to update the n-val.
-But updating the ring size is something we have not spent brain cycles on. It might work.
+We have only considered updating the configuration. It would work to update the n-val, but not the ring size.
 ```
 
-One can add a new location with new nodes, or
-add/remove nodes from existing locations. Again, a best-effort approach is provided.
-In this best effort approach, the amount of transfers needed from one node to the other is
-kept into consideration.
+One can add a new location with new nodes, or add/remove nodes from existing locations. Again, a best-effort approach is provided. In this best-effort approach, the amount of transfers needed from one node to another is taken into consideration.
 
 ### Adding a node to a location
 
@@ -112,56 +79,42 @@ BinRing1 = riak_core_claim_binring_alg:update(BinRing, [1,2,2], 2).
 io:format("~s\n", [riak_core_claim_binring_alg:show(BinRing1, 2)]).
 A1 B2 A1 B2 A1 C1 B1 C2 B1 C1 B1 C2 A1 C1 B2 C2 (0 violations)
 ```
+
 Clearly, the new ring is of size 16 and is balanced (4 A1, 3 B1, 3 B2, 3 C1 and 3 C2).
-It respects n-val 2, because no consecutive location is the same, not even when
-we wrap around.
-Another observation here is that 11 of the nodes have the same location in the
-ring. Clearly, some transfer is needed, but if we had used the `solve` approach to
-compute the new ring, we would have been presented with:
+It respects n-val 2, because no two consecutive vnodes are in the same location, not even when we wrap around.
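+
+Both observations (balance, and n-val 2 with wrap-around) can be double-checked mechanically with the `Balanced` and `AllDistinct` sketches from the requirements section, transcribing the printed ring by hand:
+
+```
+Ring1 = [{locA,1},{locB,2},{locA,1},{locB,2},{locA,1},{locC,1},{locB,1},{locC,2},
+         {locB,1},{locC,1},{locB,1},{locC,2},{locA,1},{locC,1},{locB,2},{locC,2}].
+Balanced(Ring1) andalso AllDistinct(Ring1, 2, fun({Loc, _}) -> Loc end).
+true
+```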
+
+Another observation here is that 11 of the 16 positions in the ring keep the same node. Clearly, some transfer is needed, but if we had used the `solve` approach to compute the new ring, we would have been presented with:
 ```
 A1 B1 C1 A1 B2 C2 B1 C1 A1 B2 C2 B1 C1 A1 B2 C2
 ```
-In which only 4 nodes have the same place in the ring.
-*Minimising the number of needed transfers* is the main reason for having the
-`update` function.
+In which only 4 nodes have the same place in the ring. *Minimising the number of needed transfers* is the main reason for having the `update` function.
 
 ### Remove a node from a location (leave)
 
-We can use the same update function to remove a node from a location, which in
-Riak terms is called a "leave". The node is removed from the ring data structure,
-but the process of copying the data to create a new stable ring is a process
-that takes time, only after which the node is actually removed.
+We can use the same update function to remove a node from a location, which in Riak terms is called a "leave". The node is removed from the ring data structure, but copying the data to create a new stable ring is a process that takes time, and only after that is the node actually removed.
 
-Assume we want to remove the node we have just added above. In other words, we
-return to the initial configuration `[1, 1, 2]`:
+Assume we want to remove the node we have just added above. In other words, we return to the initial configuration `[1, 1, 2]`:
 ```
 BinRing2 = riak_core_claim_binring_alg:update(BinRing1, [1,1,2], 2).
 io:format("~s\n", [riak_core_claim_binring_alg:show(BinRing2, 2)]).
 B1 C2 A1 C1 A1 C1 B1 C2 B1 C1 B1 C2 A1 C1 A1 C2 (0 violations)
 ```
-This does not give the same ring as the original placement, but close.
-In order to minimise transfers, 12 nodes keep their position.
+This does not give the same ring as the original placement, but it is close. In order to minimise transfers, 12 nodes keep their position.
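+
+Counting how many positions keep their owner between two placements (the 12 quoted here, or the 11 and 4 in the previous example) can be sketched as follows (our own illustration, with placements transcribed as tuple lists):
+
+```
+UnchangedPositions = fun(Old, New) ->
+    %% Positions where old and new owner are the same tuple.
+    length([X || {X, X} <- lists:zip(Old, New)])
+end.
+UnchangedPositions([{locA,1},{locB,1},{locA,2}], [{locA,1},{locB,2},{locA,2}]).
+2
+```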
 
 ### Leave a location
 
-In theory we can also add and leave nodes in one go. This is probably not something
-one would like to do in operation, but the algorithm allows it.
+In theory we can also add and leave nodes in one go. This is probably not something one would like to do in operation, but the algorithm allows it.
 
-For example if we update the ring above by moving one of the single nodes to
-the other location with a single node:
+For example, if we update the ring above by moving one of the single nodes to the other location with a single node:
 ```
 NewBinRing = riak_core_claim_binring_alg:update(BinRing, [2,2], 2).
 io:format("~s\n", [riak_core_claim_binring_alg:show(NewBinRing, 2)]).
 B1 A2 B2 A1 B2 A1 B2 A2 B1 A1 B2 A2 B1 A1 B1 A2 (0 violations)
 ```
-But that result is confusing, because now we have location A and B, but the
-intention was to keep location C and move a node from B to A (or alternatively from A to B).
+But that result is confusing, because now we have locations A and B, but the intention was to keep location C and move a node from B to A (or alternatively from A to B).
 
-We can patch the confusion in the [Embedding layer using this algorithm](#embedding-the-algorithm-in-riak-core)
-where we translate real node names and translations back and forth to these configurations.
-But that layer becomes easier if we actually state our intentions clearly and
-have a layer with zero nodes in the provided configuration:
+We can patch the confusion in the [Embedding layer using this algorithm](#embedding-the-algorithm-in-riak-core) where we translate real node names and locations back and forth to these configurations. But that layer becomes easier if we actually state our intentions clearly and provide a configuration with a location that has zero nodes:
 ```
 NewBinRing = riak_core_claim_binring_alg:update(BinRing, [2,0,2], 2).
 io:format("~s\n", [riak_core_claim_binring_alg:show(NewBinRing, 2)]).
@@ -175,65 +128,41 @@ we see that the nodes in location C have not changed, but that B1 is replaced by
 
 # Embedding the algorithm in riak core
 
-In Riak the claim algorithm is configurable via `wants_claim_fun` and `choose_claim_fun`.
-In order to run with this new algorithm, one should configure `choose_claim_fun`
-to `choose_claim_v4`. We do not use the wants function, but `riak_core_membership_claim`
-requires to have one, so use the default for version 2.
+In Riak the claim algorithm is configurable via `wants_claim_fun` and `choose_claim_fun`. In order to run with this new algorithm, one should configure `choose_claim_fun` to `choose_claim_v4`. We do not use the wants function, but `riak_core_membership_claim` requires one, so use the default for version 2.
+
+The main entry for claim is `riak_core_membership_claim:claim/1`. This in turn calls `riak_core_claim_swapping:choose_claim_v4/3`. This is just a wrapper to come to the real API, `riak_core_claim_swapping:claim/2`, which takes the present ring and the n-val as input.
-The main entry for claim is `riak_core_membership_claim:claim/1`.
-This in turn calls `riak_code_claim_swapping:choose_claim_v4/3`. This is just a
-wrapper to come to the real API, `riak_code_claim_swapping:claim/2` which takes
-the present ring and the n-val as input.
-
-Riak always starts from an existing ring to compute the placement
-(in case of a new node, the initial consists of that same node at each position).
-Therefore, we start with an update... if however `update` cannot find a solution without
-violations, we fall back to `solve`.
+Riak always starts from an existing ring to compute the placement (in the case of a new node, the initial ring consists of that same node at each position). Therefore, we start with an update; if, however, `update` cannot find a solution without violations, we fall back to `solve`.
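+
+Schematically, the control flow is roughly as follows. This is our own sketch, not the actual code: the violation check is internal to `riak_core_claim_swapping`, so we pass it in here as a function parameter:
+
+```
+Place = fun(OldBinRing, RingSize, Config, NVal, ZeroViolations) ->
+    %% First try to reach the new configuration with minimal transfers.
+    Updated = riak_core_claim_binring_alg:update(OldBinRing, Config, NVal),
+    case ZeroViolations(Updated, NVal) of
+        true  -> Updated;
+        %% Otherwise solve from scratch, accepting more transfers.
+        false -> riak_core_claim_binring_alg:solve(RingSize, Config, NVal)
+    end
+end.
+```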
 
 ### Mapping node/location names
 
-The main work in `riak_code_claim_swapping` is to map the real node names and the
-real locations to the configurations we provide the algorithm.
+The main work in `riak_core_claim_swapping` is to map the real node names and the real locations to the configurations we provide to the algorithm.
 
-A typical ring will contain node names as atoms and location names associated to
-those atoms. For example, one could have a ring of size 16 like this:
+A typical ring will contain node names as atoms and location names associated to those atoms. For example, one could have a ring of size 16 like this:
 ```
 n2 n4 n1 n3 n1 n3 n2 n4 n1 n3 n2 n4 n1 n3 n2 n4
 ```
-with the mapping `[{n1, loc1}, {n2, loc2}, {n3, loc3}, {n4, loc4}]`.
-We use this to create a list of tuples with location index and node index,
-something like:
+with the mapping `[{n1, loc1}, {n2, loc2}, {n3, loc3}, {n4, loc3}]`. We use this to create a list of tuples with location index and node index, something like:
 ```
 [{2, 1}, {3, 2}, {1,1}, {3,1}, {1,1}, {3,1}, {2,1}, {3,2},
  {1, 1}, {3, 1}, {2,1}, {3,2}, {1,1}, {3,1}, {2,1}, {3,2}]
 ```
-where the second integer is the index of the node in that location.
-This corresponds to:
+where the second integer is the index of the node in that location. This corresponds to:
 ```
 B1 C2 A1 C1 A1 C1 B1 C2 A1 C1 B1 C2 A1 C1 B1 C2
 ```
-With the function `riak_core_claim_binring_alg:from_list` we generate
-the ring in the binary form that the algorithm needs for the update function.
+With the function `riak_core_claim_binring_alg:from_list` we generate the ring in the binary form that the algorithm needs for the update function.
 
-The update function now also wants the new location, then computes, as described
-above a new ring, which we translate back into a list of tuples via
-`riak_core_claim_binring_alg:to_list`.
+The update function also takes the new configuration; it then computes, as described above, a new ring, which we translate back into a list of tuples via `riak_core_claim_binring_alg:to_list`.
 
-The challenge is to make sure the right indices map to the right node names!
-Because, what if we want to remove, say node `n3`.
-The configuration that we compute from the riak ring object, in which the action
-`leave n3` is present, is clearly `[1, 1, 1]`.
-When we run update, the computed ring is:
+The challenge is to make sure the right indices map to the right node names! Because, what if we want to remove, say, node `n3`? The configuration that we compute from the riak ring object, in which the action `leave n3` is present, is clearly `[1, 1, 1]`. When we run update, the computed ring is:
 ```
 R1 = riak_core_claim_binring_alg:update(BinRing, [1,1,1], 2).
 io:format("~s\n", [riak_core_claim_binring_alg:show(R1, 2)]).
 C1 B1 A1 B1 A1 C1 A1 C1 A1 C1 B1 A1 B1 C1 B1 A1 (0 violations)
 ```
-But it is easy to be mislead that C1 here is in fact `n4` and not `n3` as it was before.
-Our solution here is to compute the mapping function together with the binary ring
-in such a way that leaving nodes have a higher index than nodes that do not leave.
-So, instead we use the mapping `[{loc1, [n1]}, {loc2, [n2]}, {loc3, [n4,n3]}]` to compute
+But it is easy to be misled here: C1 is in fact `n4`, not `n3` as it was before. Our solution is to compute the mapping function together with the binary ring
+in such a way that leaving nodes have a higher index than nodes that do not leave. So, instead we use the mapping `[{loc1, [n1]}, {loc2, [n2]}, {loc3, [n4,n3]}]` to compute
 the indexes for the binary ring, which then swaps `{3,1}` and `{3,2}` and maps `n4` to C1:
 ```
 [{2, 1}, {3, 1}, {1,1}, {3,2}, {1,1}, {3,2}, {2,1}, {3,1},
@@ -247,112 +176,49 @@ which easily translates back to:
 ```
 n1 n4 n1 n2 n4 n1 n2 n4 n1 n4 n2 n1 n2 n1 n2 n4
 ```
-where `n3` is indeed removed. (This typical example unfortunately requires a lot of transfers.)
+where `n3` is indeed removed. (This particular example unfortunately requires a lot of transfers.)
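+
+The index-assignment trick itself is easy to sketch: per location, order the nodes so that leaving nodes come last (our own illustration; the actual bookkeeping lives in `riak_core_claim_swapping`):
+
+```
+OrderNodes = fun(Nodes, Leaving) ->
+    %% Keep relative order, but move leaving nodes to the end.
+    {Staying, Gone} =
+        lists:partition(fun(N) -> not lists:member(N, Leaving) end, Nodes),
+    Staying ++ Gone
+end.
+OrderNodes([n3, n4], [n3]).
+[n4,n3]
+```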
 
 # Legacy: no locations
 
-The claim algorithms version 1 to 3 that have been used in Riak before, do not consider
-locations. There the goal is to just consider the n-val for nodes. The new algorithm also
-supports that, such that if you have no locations, you can use this newer
-algorithm. In fact, you can just configure to use this new claim algorithm and
-run as usual. The module `riak_code_claim_swapping` checks whether you have defined
-locations and if not, it puts all the nodes in one location.
+The claim algorithms version 1 to 3 that have been used in Riak before do not consider locations. There the goal is just to consider the n-val for nodes. The new algorithm also
+supports that, such that if you have no locations, you can use this newer algorithm. In fact, you can just configure this new claim algorithm and run as usual. The module `riak_core_claim_swapping` checks whether you have defined locations and, if not, it puts all the nodes in one location.
 
-Effectively, the `solve` and `update` function are called with `{NVal, 1}` instead
-of `NVal` as argument, where the second element of the tuple is the location n-val.
+Effectively, the `solve` and `update` functions are called with `#{node => NVal, location => 1}` instead of `NVal` as argument.
 
 ```
-BinRing = riak_core_claim_binring_alg:solve(16, [4], {2,1}).
+BinRing = riak_core_claim_binring_alg:solve(16, [4], #{node => 2, location => 1}).
 io:format("~s\n", [riak_core_claim_binring_alg:show(BinRing, {2, 1})]).
 A3 A1 A2 A4 A1 A2 A3 A4 A1 A2 A3 A4 A1 A2 A3 A4 (0 violations)
 ```
 
-## Do not consider different n-vals
+# Different target n_vals - nodes and locations
 
-In principle, one could use different n-val values for location and nodes,
-for example use n-val 4 for nodes, but n-val 3 for locations. This, though, seems
-not to have a valuable use case other than the above 1 location case.
-
-For example, take a configuration for which it is hard to find a solution and
-therefore the best-effort approach comes back with violations. We try to find a
-ring of size 32 with n-val 3 (both nodes and location) and 3 locations with 2 nodes each:
-```
-BadRing = riak_core_claim_binring_alg:solve(32, [2,2,2], {3,3}).
-io:format("~s\n", [riak_core_claim_binring_alg:show(BadRing, {3, 3})]).
-A1 B1 C2 A1 B1 C1 A2 B2 A1 B1 C1 A2 B2 C2 A1 B1 C1 A2 B2 C2 A1 B1 C1 A2 B2 C2 A1 B1 C1 A2 B2 C2 (4 violations)
-```
-This ring has 4 violations, but if we examine it carefully, we can see that it
-satisfies n-val 2 for locations. This might be acceptable risk, although not perfect.
-```
-io:format("~s\n", [riak_core_claim_binring_alg:show(BadRing, {3, 2})]).
-A1 B1 C2 A1 B1 C1 A2 B2 A1 B1 C1 A2 B2 C2 A1 B1 C1 A2 B2 C2 A1 B1 C1 A2 B2 C2 A1 B1 C1 A2 B2 C2 (0 violations)
-```
-
-In this case we try to solve for the ideal case, we fail and inspect and decide
-that it is good enough for a relaxed case. However, if we directly would have tried
-to compute for the relaxed case, we would have found:
-```
-BadRing = riak_core_claim_binring_alg:solve(32, [2,2,2], {3,2}).
-io:format("~s\n", [riak_core_claim_binring_alg:show(BadRing, {3, 2})]).
-A1 B1 A2 C2 A1 B1 C1 B2 A1 B1 C1 A2 B2 C2 A1 B1 C1 A2 B2 C2 A1 B1 C1 A2 B2 C2 A1 B1 C1 A2 B2 C2 (0 violations)
-```
-But when we inspect this ring in more detail, we see that it has 8 violations
-when considering the more difficult target `{3, 3}`.
-```
-io:format("~s\n", [riak_core_claim_binring_alg:show(BadRing, {3, 3})]).
-A1 B1 A2 C2 A1 B1 C1 B2 A1 B1 C1 A2 B2 C2 A1 B1 C1 A2 B2 C2 A1 B1 C1 A2 B2 C2 A1 B1 C1 A2 B2 C2 (8 violations)
-```
-Therefore, it seems that
-this solution is worst than taking the best-effort solution and inspect how it
-behaves for relaxed requirements.
+The default setting is a target_n_val of 4 for nodes and a target_n_val of 3 for locations. There will be some situations, though, where a better configuration would have been found by increasing the target_n_vals, and in particular by matching the location and node n_val. With higher n_vals there is a higher chance of an unsolvable configuration, and when the `riak admin cluster plan` function is called the operator will be notified of any violations. In that case it is possible to clear the plan, change these settings on the claimant node (indicated with a `(C)` in the plan), and re-plan with alternative settings, to see if the outcome is preferable, perhaps because it reduces the number of required transfers.
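+
+For illustration, these default targets can be expressed in the same map form as in the legacy example above (assuming, as there, that the third argument also accepts such a map when locations are in play):
+
+```
+BinRing = riak_core_claim_binring_alg:solve(512, [3,3,3,3], #{node => 4, location => 3}).
+```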
+
+The plan function under claim_v4 will always return the same answer for the same configuration, so reverting any changes and re-planning will return to the original plan.
 
 # Solvable configurations
 
-As we see above, the algorithm may fail to satisfy the provided n-val. In fact,
-there are many configurations that are simply impossible to solve. Trivially
-when the number of locations is smaller than the n-val, etc. But excluding those
-trivial cases, we played with a
-SAT solver to find 82 *impossible configurations* with ring size 16 and n-val 2 or 3.
-This resulted in some necessary requirements to be able to find a solution at all,
-which we use in [QuickCheck tests](../test/riak_core_claim_eqc.erl#L261) to avoid
-testing the wrong things.
+As we see above, the algorithm may fail to satisfy the provided n-val. In fact, there are many configurations that are simply impossible to solve: trivially so, for example, when the number of locations is smaller than the n-val. But excluding those trivial cases, we played with a SAT solver to find 82 *impossible configurations* with ring size 16 and n-val 2 or 3.
+
+This resulted in some necessary requirements to be able to find a solution at all, which we use in [QuickCheck tests](../test/riak_core_claim_eqc.erl#L261) to avoid testing the wrong things.
 
-Here we present some rules of thumb for good start configurations and typically
-more successful update configurations.
+Here we present some rules of thumb for good starting configurations and for update configurations that are typically more successful.
 
-In general, a larger ring size is easier to solve than a small ring size. We simply
-have more play room to swap nodes to get to a solution. But note that it is more
-computation intensive when the ring size grows.
+In general, a larger ring size is easier to solve than a small ring size. We simply have more room to swap nodes to get to a solution. But note that the computation becomes more intensive as the ring size grows.
 
-Distributing the nodes evenly over the location makes it more likely to find a solution.
-For a realistic example with ring size 512, n-val 4 and 4 locations which 3 nodes each,
-we easily find a solution, similar when we put 2 nodes each in the 4 locations.
-But the configuration `[3,3,3,4]` has no solution. In that case it actually works
-to put the extra node in a different location.
+Distributing the nodes evenly over the locations makes it more likely to find a solution. For a realistic example with ring size 512, n-val 4 and 4 locations with 3 nodes each, we easily find a solution, and similarly when we put 2 nodes each in the 4 locations. But the configuration `[3,3,3,4]` has no solution. In that case it actually works to put the extra node in a different location.
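+
+Per the claims above, one could explore this rule of thumb in the shell: the first call can only come back with a best-effort ring whose `show` output reports violations, while the second should solve cleanly:
+
+```
+Hard = riak_core_claim_binring_alg:solve(512, [3,3,3,4], 4).
+Easy = riak_core_claim_binring_alg:solve(512, [3,3,3,3,1], 4).
+io:format("~s\n~s\n", [riak_core_claim_binring_alg:show(Hard, 4),
+                       riak_core_claim_binring_alg:show(Easy, 4)]).
+```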
 
-In general, adding an extra location and having more locations than n-val makes
-it easier to find a solution. With ring size 512 and n-val 3 a solution for `[2, 2, 2, 2]`
-is quickly found, but the best-effort solution for `[3, 3, 3]` has 4 violations.
-So, even if there are 9 nodes in the latter configuration and only 8 in the earlier,
-it is harder to find a placement.
+In general, adding an extra location and having more locations than n-val makes it easier to find a solution. With ring size 512 and n-val 3, a solution for `[2, 2, 2, 2]` is quickly found, but the best-effort solution for `[3, 3, 3]` has 4 violations. So, even though there are 9 nodes in the latter configuration and only 8 in the former, it is harder to find a placement.
 
 # Conclusion
 
-The new algorithm for node placement in a ring handles the case where location
-is an additional property to consider in a Riak installation. It is backward
-compatible with the situation in which no locations are considered at all.
+The new algorithm for node placement in a ring handles the case where location is an additional property to consider in a Riak installation. It is backward compatible with the situation in which no locations are considered at all.
 
-The algorithm handles both addition of new nodes, in same or new locations, as well
-as nodes leaving the ring.
+The algorithm handles both the addition of new nodes, in existing or new locations, and nodes leaving the ring.
 
-The algorithm has an inherent high complexity and can take a long time to come up
-with a solution. Since the algorithm is only used when planning a new configuration
-for a Riak installation, we find it acceptable that one needs to wait upto one or
-two minutes for a solution. In fact, one only needs to wait long when it is hard
-to find a solution. We provided some rules of thumb to provide configurations that
-are relatively easy to solve.
+The algorithm has an inherently high complexity and can take a long time to come up with a solution. Since the algorithm is only used when planning a new configuration for a Riak installation, we find it acceptable that one needs to wait up to one or two minutes for a solution. In fact, one only needs to wait long when it is hard to find a solution. We provided some rules of thumb for choosing configurations that are relatively easy to solve.
 
-This algorithm will be released with the next version of Riak we create
-for NHS-Digital.
+This algorithm will be released with the next version of Riak we create for NHS-Digital.