From 6be124b86e566a8b8453a55c01c0a292bbd5a2a2 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Fri, 27 Sep 2013 16:32:46 +0900 Subject: [PATCH 1/8] Fixes #343 Before this patch, the riak_core_node_watcher.erl code assumed that if the riak_core_ring_events proc (a gen_event server) died, then a {gen_event_EXIT,_,_} message would be sent. However, that assumption is not correct. If it dies, we get a regular {EXIT,_,_} message. Tested by repeated use of alternating: * exit(whereis(riak_core_ring_events), kill). ... and looking at the length of the links list of process_info(whereis(riak_core_ring_events), links) -- it should be three, not two. --- src/riak_core_node_watcher.erl | 30 ++++++++++++++++++++++++------ src/riak_core_ring_events.erl | 6 +++++- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/src/riak_core_node_watcher.erl b/src/riak_core_node_watcher.erl index 59053d262..3699ec331 100644 --- a/src/riak_core_node_watcher.erl +++ b/src/riak_core_node_watcher.erl @@ -44,7 +44,8 @@ -export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2, code_change/3]). --record(state, { status = up, +-record(state, { ring_events_pid :: pid(), + status = up, services = [], health_checks = [], healths_enabled = true, @@ -172,7 +173,7 @@ init([]) -> %% Setup callback notification for ring changes; note that we use the %% supervised variation so that the callback gets removed if this process %% exits - watch_for_ring_events(), + RingEventsPid = watch_for_ring_events(), %% Watch for node up/down events net_kernel:monitor_nodes(true), @@ -180,7 +181,7 @@ init([]) -> %% Setup ETS table to track node status ets:new(?MODULE, [protected, {read_concurrency, true}, named_table]), - {ok, schedule_broadcast(#state{})}. + {ok, schedule_broadcast(#state{ring_events_pid=RingEventsPid})}. handle_call({set_bcast_mod, Module, Fn}, _From, State) -> %% Call available for swapping out how broadcasts are generated @@ -330,6 +331,10 @@ handle_info({'DOWN', Mref, _, _Pid, _Info}, State) -> {noreply, update_avsn(S3)} end; +handle_info({'EXIT', Pid, _Cause}, #state{ring_events_pid=RingEventsPid}=State) + when Pid == RingEventsPid -> + RingEventsPid2 = watch_for_ring_events(), + {noreply, State#state{ring_events_pid=RingEventsPid2}}; handle_info({'EXIT', Pid, _Cause} = Msg, State) -> Service = erlang:erase(Pid), State2 = handle_check_msg(Msg, Service, State), @@ -341,8 +346,8 @@ handle_info({check_health, Id}, State) -> handle_info({gen_event_EXIT, _, _}, State) -> %% Ring event handler has been removed for some reason; re-register - watch_for_ring_events(), - {noreply, update_avsn(State)}; + RingEventsPid = watch_for_ring_events(), + {noreply, update_avsn(State#state{ring_events_pid=RingEventsPid})}; handle_info(broadcast, State) -> S2 = broadcast(State#state.peers, State), @@ -367,11 +372,24 @@ update_avsn(State) -> State#state { avsn = State#state.avsn + 1 }. watch_for_ring_events() -> + RingEventsPid = riak_core_ring_events:get_pid(), Self = self(), Fn = fun(R) -> gen_server:cast(Self, {ring_update, R}) end, - riak_core_ring_events:add_sup_callback(Fn). + riak_core_ring_events:add_sup_callback(Fn), + case riak_core_ring_events:get_pid() of + P when P == RingEventsPid -> + RingEventsPid; + _ -> + receive + {gen_event_EXIT, _, _} -> + ok + after 100 -> + ok + end, + watch_for_ring_events() + end. delete_service_mref(Id) -> %% Cleanup the monitor if one exists diff --git a/src/riak_core_ring_events.erl b/src/riak_core_ring_events.erl index 72e084cb5..4d31f90d6 100644 --- a/src/riak_core_ring_events.erl +++ b/src/riak_core_ring_events.erl @@ -34,7 +34,8 @@ ring_update/1, force_update/0, ring_sync_update/1, - force_sync_update/0]). + force_sync_update/0, + get_pid/0]). %% gen_event callbacks -export([init/1, handle_event/2, handle_call/2, @@ -81,6 +82,9 @@ force_sync_update() -> ring_sync_update(Ring) -> gen_event:sync_notify(?MODULE, {ring_update, Ring}). +get_pid() -> + whereis(?MODULE). + %% =================================================================== %% gen_event callbacks %% =================================================================== From 33c322dbbaa64b9810b0f27b36b20e6cdf3c5d0c Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Tue, 1 Oct 2013 15:09:36 +0900 Subject: [PATCH 2/8] Address review comments --- src/riak_core_node_watcher.erl | 5 +-- src/riak_core_ring_events.erl | 56 ++++++++++++++++++++++++++++++++-- src/riak_core_ring_manager.erl | 4 +++ 3 files changed, 60 insertions(+), 5 deletions(-) diff --git a/src/riak_core_node_watcher.erl b/src/riak_core_node_watcher.erl index 3699ec331..ab5dc12df 100644 --- a/src/riak_core_node_watcher.erl +++ b/src/riak_core_node_watcher.erl @@ -377,9 +377,9 @@ watch_for_ring_events() -> Fn = fun(R) -> gen_server:cast(Self, {ring_update, R}) end, - riak_core_ring_events:add_sup_callback(Fn), + HandlerName = riak_core_ring_events:add_sup_callback(Fn), case riak_core_ring_events:get_pid() of - P when P == RingEventsPid -> + P when is_pid(P), P == RingEventsPid -> RingEventsPid; _ -> receive @@ -388,6 +388,7 @@ watch_for_ring_events() -> after 100 -> ok end, + riak_core_ring_events:delete_handler(HandlerName, race_cond_retry), watch_for_ring_events() end. diff --git a/src/riak_core_ring_events.erl b/src/riak_core_ring_events.erl index 4d31f90d6..9351d0db6 100644 --- a/src/riak_core_ring_events.erl +++ b/src/riak_core_ring_events.erl @@ -31,6 +31,7 @@ add_callback/1, add_sup_callback/1, add_guarded_callback/1, + delete_handler/2, ring_update/1, force_update/0, ring_sync_update/1, @@ -43,6 +44,10 @@ -record(state, { callback }). +-ifdef(TEST). +-include_lib("eunit/include/eunit.hrl"). +-endif. + %% =================================================================== %% API functions %% =================================================================== @@ -60,13 +65,22 @@ add_guarded_handler(Handler, Args) -> riak_core:add_guarded_event_handler(?MODULE, Handler, Args). add_callback(Fn) when is_function(Fn) -> - gen_event:add_handler(?MODULE, {?MODULE, make_ref()}, [Fn]). + HandlerName = {?MODULE, make_ref()}, + gen_event:add_handler(?MODULE, HandlerName, [Fn]), + HandlerName. add_sup_callback(Fn) when is_function(Fn) -> - gen_event:add_sup_handler(?MODULE, {?MODULE, make_ref()}, [Fn]). + HandlerName = {?MODULE, make_ref()}, + gen_event:add_sup_handler(?MODULE, HandlerName, [Fn]), + HandlerName. add_guarded_callback(Fn) when is_function(Fn) -> - riak_core:add_guarded_event_handler(?MODULE, {?MODULE, make_ref()}, [Fn]). + HandlerName = {?MODULE, make_ref()}, + riak_core:add_guarded_event_handler(?MODULE, HandlerName, [Fn]), + HandlerName. + +delete_handler(HandlerName, ReasonArgs) -> + gen_event:delete_handler(?MODULE, HandlerName, ReasonArgs). force_update() -> {ok, Ring} = riak_core_ring_manager:get_my_ring(), @@ -110,3 +124,39 @@ terminate(_Reason, _State) -> code_change(_OldVsn, State, _Extra) -> {ok, State}. +-ifdef(TEST). + +delete_handler_test_() -> + RingMgr = riak_core_ring_manager, + {setup, + fun() -> + meck:new(RingMgr, [passthrough]), + meck:expect(RingMgr, get_my_ring, + fun() -> {ok, fake_ring_here} end), + ?MODULE:start_link() + end, + fun(_) -> + meck:unload(RingMgr) + end, + [ + fun () -> + Name = ?MODULE:add_sup_callback( + fun(Arg) -> RingMgr:test_dummy_func(Arg) end), + + BogusRing = bogus_ring_stand_in, + ?MODULE:ring_update(BogusRing), + ok = ?MODULE:delete_handler(Name, unused), + {error, _} = ?MODULE:delete_handler(Name, unused), + + %% test_dummy_func is called twice: once by add_sup_callback() + %% and once by ring_update(). + [ + {_, {RingMgr, get_my_ring, _}, _}, + {_, {RingMgr, test_dummy_func, _}, _}, + {_, {RingMgr, test_dummy_func, [BogusRing]}, _} + ] = meck:history(RingMgr), + ok + end + ]}. + +-endif. % TEST diff --git a/src/riak_core_ring_manager.erl b/src/riak_core_ring_manager.erl index 8af0e78f8..0d0c8b268 100644 --- a/src/riak_core_ring_manager.erl +++ b/src/riak_core_ring_manager.erl @@ -97,6 +97,7 @@ }). -export([setup_ets/1, cleanup_ets/1, set_ring_global/1]). %% For EUnit testing +-export([test_dummy_func/1]). -ifdef(TEST). -include_lib("eunit/include/eunit.hrl"). -endif. @@ -627,6 +628,9 @@ prune_write_ring(Ring, State) -> %% =================================================================== -ifdef(TEST). +test_dummy_func(_Arg) -> + ok. + back_test() -> X = [1,2,3], List1 = [[1,2,3],[4,2,3], [7,8,3], [11,12,13], [1,2,3]], From bbcc3a36f633b7a0da7d0c0844198bda7b41a967 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Tue, 1 Oct 2013 15:34:20 +0900 Subject: [PATCH 3/8] Derp, fix non-EUnit compilation --- src/riak_core_ring_manager.erl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/riak_core_ring_manager.erl b/src/riak_core_ring_manager.erl index 0d0c8b268..96a730d25 100644 --- a/src/riak_core_ring_manager.erl +++ b/src/riak_core_ring_manager.erl @@ -623,14 +623,14 @@ prune_write_ring(Ring, State) -> State2 = set_ring(Ring, State), State2. +test_dummy_func(_Arg) -> + ok. + %% =================================================================== %% Unit tests %% =================================================================== -ifdef(TEST). -test_dummy_func(_Arg) -> - ok. - back_test() -> X = [1,2,3], List1 = [[1,2,3],[4,2,3], [7,8,3], [11,12,13], [1,2,3]], From 01c0ada19c2edf36f13169bc3104df646e2bbcf0 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Fri, 4 Oct 2013 16:36:15 +0900 Subject: [PATCH 4/8] Add poll_for_riak_core_ring_events_pid() --- src/riak_core_node_watcher.erl | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/riak_core_node_watcher.erl b/src/riak_core_node_watcher.erl index ab5dc12df..fd5a6c7a1 100644 --- a/src/riak_core_node_watcher.erl +++ b/src/riak_core_node_watcher.erl @@ -372,7 +372,9 @@ update_avsn(State) -> State#state { avsn = State#state.avsn + 1 }. watch_for_ring_events() -> - RingEventsPid = riak_core_ring_events:get_pid(), + %% Polling isn't (and cannot be) perfect, good enough 99% of the time + %% Our supervisor takes care of the last 1%. + RingEventsPid = poll_for_riak_core_ring_events_pid(10), Self = self(), Fn = fun(R) -> gen_server:cast(Self, {ring_update, R}) @@ -392,6 +394,17 @@ watch_for_ring_events() -> watch_for_ring_events() end. +poll_for_riak_core_ring_events_pid(0) -> + undefined; +poll_for_riak_core_ring_events_pid(N) -> + case riak_core_ring_events:get_pid() of + Pid when is_pid(Pid) -> + Pid; + _ -> + timer:sleep(100), + poll_for_riak_core_ring_events_pid(N-1) + end. + delete_service_mref(Id) -> %% Cleanup the monitor if one exists case erlang:get(Id) of From c8986a3708bd50f26dc20bffa6bf4ec4951ad615 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Tue, 15 Oct 2013 16:03:26 +0900 Subject: [PATCH 5/8] Undo all changes in this branch to src/riak_core_node_watcher.erl --- src/riak_core_node_watcher.erl | 44 +++++----------------------------- 1 file changed, 6 insertions(+), 38 deletions(-) diff --git a/src/riak_core_node_watcher.erl b/src/riak_core_node_watcher.erl index fd5a6c7a1..59053d262 100644 --- a/src/riak_core_node_watcher.erl +++ b/src/riak_core_node_watcher.erl @@ -44,8 +44,7 @@ -export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2, code_change/3]). --record(state, { ring_events_pid :: pid(), - status = up, +-record(state, { status = up, services = [], health_checks = [], healths_enabled = true, @@ -173,7 +172,7 @@ init([]) -> %% Setup callback notification for ring changes; note that we use the %% supervised variation so that the callback gets removed if this process %% exits - RingEventsPid = watch_for_ring_events(), + watch_for_ring_events(), %% Watch for node up/down events net_kernel:monitor_nodes(true), @@ -181,7 +180,7 @@ init([]) -> %% Setup ETS table to track node status ets:new(?MODULE, [protected, {read_concurrency, true}, named_table]), - {ok, schedule_broadcast(#state{ring_events_pid=RingEventsPid})}. + {ok, schedule_broadcast(#state{})}. handle_call({set_bcast_mod, Module, Fn}, _From, State) -> %% Call available for swapping out how broadcasts are generated @@ -331,10 +330,6 @@ handle_info({'DOWN', Mref, _, _Pid, _Info}, State) -> {noreply, update_avsn(S3)} end; -handle_info({'EXIT', Pid, _Cause}, #state{ring_events_pid=RingEventsPid}=State) - when Pid == RingEventsPid -> - RingEventsPid2 = watch_for_ring_events(), - {noreply, State#state{ring_events_pid=RingEventsPid2}}; handle_info({'EXIT', Pid, _Cause} = Msg, State) -> Service = erlang:erase(Pid), State2 = handle_check_msg(Msg, Service, State), @@ -346,8 +341,8 @@ handle_info({check_health, Id}, State) -> handle_info({gen_event_EXIT, _, _}, State) -> %% Ring event handler has been removed for some reason; re-register - RingEventsPid = watch_for_ring_events(), - {noreply, update_avsn(State#state{ring_events_pid=RingEventsPid})}; + watch_for_ring_events(), + {noreply, update_avsn(State)}; handle_info(broadcast, State) -> S2 = broadcast(State#state.peers, State), @@ -372,38 +367,11 @@ update_avsn(State) -> State#state { avsn = State#state.avsn + 1 }. watch_for_ring_events() -> - %% Polling isn't (and cannot be) perfect, good enough 99% of the time - %% Our supervisor takes care of the last 1%. - RingEventsPid = poll_for_riak_core_ring_events_pid(10), Self = self(), Fn = fun(R) -> gen_server:cast(Self, {ring_update, R}) end, - HandlerName = riak_core_ring_events:add_sup_callback(Fn), - case riak_core_ring_events:get_pid() of - P when is_pid(P), P == RingEventsPid -> - RingEventsPid; - _ -> - receive - {gen_event_EXIT, _, _} -> - ok - after 100 -> - ok - end, - riak_core_ring_events:delete_handler(HandlerName, race_cond_retry), - watch_for_ring_events() - end. - -poll_for_riak_core_ring_events_pid(0) -> - undefined; -poll_for_riak_core_ring_events_pid(N) -> - case riak_core_ring_events:get_pid() of - Pid when is_pid(Pid) -> - Pid; - _ -> - timer:sleep(100), - poll_for_riak_core_ring_events_pid(N-1) - end. + riak_core_ring_events:add_sup_callback(Fn). delete_service_mref(Id) -> %% Cleanup the monitor if one exists From 65ee0a4325ddb05d802a5f3baa6c9451ccff18c5 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Tue, 15 Oct 2013 16:25:08 +0900 Subject: [PATCH 6/8] Change supervisor structure. Broken: ring never settled after join. --- ebin/riak_core.app | 1 + src/riak_core_ring_events_sup.erl | 55 +++++++++++++++++++++++++++++++ src/riak_core_sup.erl | 3 +- 3 files changed, 57 insertions(+), 2 deletions(-) create mode 100644 src/riak_core_ring_events_sup.erl diff --git a/ebin/riak_core.app b/ebin/riak_core.app index 1bce47b36..d6d2d6d8f 100644 --- a/ebin/riak_core.app +++ b/ebin/riak_core.app @@ -54,6 +54,7 @@ riak_core_repair, riak_core_ring, riak_core_ring_events, + riak_core_ring_events_sup, riak_core_ring_handler, riak_core_ring_manager, riak_core_ring_util, diff --git a/src/riak_core_ring_events_sup.erl b/src/riak_core_ring_events_sup.erl new file mode 100644 index 000000000..d7ebe49e5 --- /dev/null +++ b/src/riak_core_ring_events_sup.erl @@ -0,0 +1,55 @@ +%% ------------------------------------------------------------------- +%% +%% riak_core: Core Riak Application +%% +%% Copyright (c) 2007-2013 Basho Technologies, Inc. All Rights Reserved. +%% +%% This file is provided to you under the Apache License, +%% Version 2.0 (the "License"); you may not use this file +%% except in compliance with the License. You may obtain +%% a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, +%% software distributed under the License is distributed on an +%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +%% KIND, either express or implied. See the License for the +%% specific language governing permissions and limitations +%% under the License. +%% +%% ------------------------------------------------------------------- + +-module(riak_core_ring_events_sup). + +-behaviour(supervisor). + +%% API +-export([start_link/0]). + +%% Supervisor callbacks +-export([init/1]). + +%% Helper macro for declaring children of supervisor +-define(CHILD(I, Type, Timeout), {I, {I, start_link, []}, permanent, Timeout, Type, [I]}). +-define(CHILD(I, Type), ?CHILD(I, Type, 5000)). + +%% =================================================================== +%% API functions +%% =================================================================== + +start_link() -> + supervisor:start_link({local, ?MODULE}, ?MODULE, []). + +%% =================================================================== +%% Supervisor callbacks +%% =================================================================== + +init([]) -> + Children = lists:flatten( + [ + ?CHILD(riak_core_ring_events, worker), + ?CHILD(riak_core_node_watcher, worker) + ]), + + {ok, {{one_for_all, 9999, 10}, Children}}. diff --git a/src/riak_core_sup.erl b/src/riak_core_sup.erl index ba6888aed..ddce52350 100644 --- a/src/riak_core_sup.erl +++ b/src/riak_core_sup.erl @@ -50,14 +50,13 @@ init([]) -> [?CHILD(riak_core_sysmon_minder, worker), ?CHILD(riak_core_vnode_sup, supervisor, 305000), ?CHILD(riak_core_eventhandler_sup, supervisor), - ?CHILD(riak_core_ring_events, worker), + ?CHILD(riak_core_ring_events_sup, supervisor), ?CHILD(riak_core_ring_manager, worker), ?CHILD(riak_core_metadata_manager, worker), ?CHILD(riak_core_metadata_hashtree, worker), ?CHILD(riak_core_broadcast, worker), ?CHILD(riak_core_vnode_proxy_sup, supervisor), ?CHILD(riak_core_node_watcher_events, worker), - ?CHILD(riak_core_node_watcher, worker), ?CHILD(riak_core_vnode_manager, worker), ?CHILD(riak_core_capability, worker), ?CHILD(riak_core_handoff_sup, supervisor), From 4560248e1b1307236f2d2c843a59cf68769a9167 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Wed, 16 Oct 2013 11:11:16 +0900 Subject: [PATCH 7/8] Fix broken last commit: start riak_core_node_watcher later in supervisor start sequence --- src/riak_core_ring_events_sup.erl | 11 ++++++++--- src/riak_core_vnode_manager.erl | 2 ++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/riak_core_ring_events_sup.erl b/src/riak_core_ring_events_sup.erl index d7ebe49e5..e9892c4a7 100644 --- a/src/riak_core_ring_events_sup.erl +++ b/src/riak_core_ring_events_sup.erl @@ -25,7 +25,7 @@ -behaviour(supervisor). %% API --export([start_link/0]). +-export([start_link/0, start_late_worker/0]). %% Supervisor callbacks -export([init/1]). @@ -41,6 +41,12 @@ start_link() -> supervisor:start_link({local, ?MODULE}, ?MODULE, []). +late_worker_childspec() -> + ?CHILD(riak_core_node_watcher, worker). + +start_late_worker() -> + supervisor:start_child(?MODULE, late_worker_childspec()). + %% =================================================================== %% Supervisor callbacks %% =================================================================== @@ -48,8 +54,7 @@ start_link() -> init([]) -> Children = lists:flatten( [ - ?CHILD(riak_core_ring_events, worker), - ?CHILD(riak_core_node_watcher, worker) + ?CHILD(riak_core_ring_events, worker) ]), {ok, {{one_for_all, 9999, 10}, Children}}. diff --git a/src/riak_core_vnode_manager.erl b/src/riak_core_vnode_manager.erl index 500285a5a..6c664aacb 100644 --- a/src/riak_core_vnode_manager.erl +++ b/src/riak_core_vnode_manager.erl @@ -206,6 +206,8 @@ get_all_vnodes(Mod) -> %% @private init(_State) -> + {ok, _Pid} = riak_core_ring_events_sup:start_late_worker(), + error_logger:warning_msg("YO started late worker ~p\n", [_Pid]), {ok, Ring, CHBin} = riak_core_ring_manager:get_raw_ring_chashbin(), Mods = [Mod || {_, Mod} <- riak_core:vnode_modules()], State = #state{forwarding=dict:new(), handoff=dict:new(), From 6e4f1566ae1b47206d75160f3c623f91ce8cfaa6 Mon Sep 17 00:00:00 2001 From: Scott Lystig Fritchie Date: Wed, 16 Oct 2013 11:56:40 +0900 Subject: [PATCH 8/8] Fix supervisor server proc ordering problem of previous commit --- src/riak_core_ring_events_sup.erl | 8 ++++---- src/riak_core_sup.erl | 6 ++++++ src/riak_core_vnode_manager.erl | 5 +++-- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/riak_core_ring_events_sup.erl b/src/riak_core_ring_events_sup.erl index e9892c4a7..16a8e42a4 100644 --- a/src/riak_core_ring_events_sup.erl +++ b/src/riak_core_ring_events_sup.erl @@ -25,7 +25,7 @@ -behaviour(supervisor). %% API --export([start_link/0, start_late_worker/0]). +-export([start_link/0, start_riak_core_node_watcher/0]). %% Supervisor callbacks -export([init/1]). @@ -41,11 +41,11 @@ start_link() -> supervisor:start_link({local, ?MODULE}, ?MODULE, []). -late_worker_childspec() -> +riak_core_node_watcher_childspec() -> ?CHILD(riak_core_node_watcher, worker). -start_late_worker() -> - supervisor:start_child(?MODULE, late_worker_childspec()). +start_riak_core_node_watcher() -> + supervisor:start_child(?MODULE, riak_core_node_watcher_childspec()). %% =================================================================== %% Supervisor callbacks diff --git a/src/riak_core_sup.erl b/src/riak_core_sup.erl index ddce52350..44c4744cf 100644 --- a/src/riak_core_sup.erl +++ b/src/riak_core_sup.erl @@ -57,6 +57,12 @@ init([]) -> ?CHILD(riak_core_broadcast, worker), ?CHILD(riak_core_vnode_proxy_sup, supervisor), ?CHILD(riak_core_node_watcher_events, worker), + %% riak_core_node_watcher is no longer started here. It is + %% now a *dynamic* child of the riak_core_ring_events_sup + %% and started by riak_core_vnode_manager:init/1. If it is + %% started by riak_core_ring_events_sup during its startup, + %% it is too early, and Riak rings never settle after a ring + %% change. ?CHILD(riak_core_vnode_manager, worker), ?CHILD(riak_core_capability, worker), ?CHILD(riak_core_handoff_sup, supervisor), diff --git a/src/riak_core_vnode_manager.erl b/src/riak_core_vnode_manager.erl index 6c664aacb..6cbe8b4df 100644 --- a/src/riak_core_vnode_manager.erl +++ b/src/riak_core_vnode_manager.erl @@ -206,8 +206,9 @@ get_all_vnodes(Mod) -> %% @private init(_State) -> - {ok, _Pid} = riak_core_ring_events_sup:start_late_worker(), - error_logger:warning_msg("YO started late worker ~p\n", [_Pid]), + %% See riak_core_sup:init/1 for why this call is here. + {ok, _Pid} = riak_core_ring_events_sup:start_riak_core_node_watcher(), + {ok, Ring, CHBin} = riak_core_ring_manager:get_raw_ring_chashbin(), Mods = [Mod || {_, Mod} <- riak_core:vnode_modules()], State = #state{forwarding=dict:new(), handoff=dict:new(),