Skip to content
This repository

Add support to vnode master to wait until related service has started #161

Merged
merged 2 commits into from about 2 years ago

2 participants

Joseph Blomstedt Jon Meredith
Joseph Blomstedt
Owner

This patch changes riak_core_vnode_master so that the vnode master will wait until the related service has started before handling requests. This is necessary in legacy_vnode_routing mode because the routing proxies may not be available until the service has fully started. To actually enable waiting, vnode services must change how they spawn the vnode master, passing in the name of the related service. A separate pull-request makes this change to riak_kv.

This change (and the related riak_kv change) is designed to address issue #155.

Joseph Blomstedt
Owner

A reproducible test has been added to riak_test as well. It requires changes introduced in the first commit of this pull-request in order to run. Those changes allow Riak to have a delayed startup, which the tests uses to mock the vnode proxy startup process to introduce additional delay that makes the race condition we're fixing reproducible.

To verify changes. Setup and run the gh_riak_core_155 riak_test.

Running against basho/riak_core@67aaf78 and all other deps at 1.1 and the test should fail.

Running against 1.1 using this branch for riak_core and gh-riak-core-155-kv for riak_kv, and the test should pass.

Example expected failure output:

$ ./riak_test rtdev.config gh_riak_core_155
14:13:14.596 [info] Application lager started on node 'riak_test@127.0.0.1'
14:13:14.599 [info] Riak path: "/tmp/rt"
14:13:14.600 [info] Running: /tmp/rt/dev/dev1/bin/riak stop
14:13:14.828 [info] Resetting nodes to fresh state
14:13:14.828 [debug] Running: git --git-dir="/tmp/rt/dev/.git" --work-tree="/tmp/rt/dev" reset HEAD --hard
14:13:14.858 [debug] Running: git --git-dir="/tmp/rt/dev/.git" --work-tree="/tmp/rt/dev" clean -fd
14:13:14.879 [info] Running: /tmp/rt/dev/dev1/bin/riak start
14:13:16.626 [debug] Supervisor inet_gethost_native_sup started undefined at pid <0.60.0>
14:13:16.627 [debug] Supervisor kernel_safe_sup started inet_gethost_native:start_link() at pid <0.59.0>
14:13:16.640 [info] Deployed nodes: ['dev1@127.0.0.1']
14:13:16.642 [info] Cluster built: ['dev1@127.0.0.1']
14:13:16.648 [info] Adding delayed start to app.config
14:13:16.649 [info] Running: /tmp/rt/dev/dev1/bin/riak stop
14:13:18.889 [info] Running: /tmp/rt/dev/dev1/bin/riak start
14:13:21.975 [info] Running: /tmp/rt/dev/dev1/bin/riak stop
14:13:24.207 [info] Running: /tmp/rt/dev/dev1/bin/riak start
14:13:25.382 [info] Installed mocks to delay riak_kv proxy startup
14:13:25.382 [info] Issuing 10000 gets against 'dev1@127.0.0.1'
14:13:27.295 [info] Verifying 'dev1@127.0.0.1' has not crashed
escript: exception error: {assertEqual_failed,[{module,gh_riak_core_155},
                                      {line,35},
                                      {expression,"net_adm : ping ( Node )"},
                                      {expected,pong},
                                      {value,pang}]}
  in function  gh_riak_core_155:'-gh_riak_core_155/0-fun-0-'/2
  in call from gh_riak_core_155:'-gh_riak_core_155/0-lc$^0/1-0-'/2
  in call from gh_riak_core_155:gh_riak_core_155/0
  in call from riak_test:main/1
  in call from escript:run/2
  in call from escript:start/1
  in call from init:start_it/1
  in call from init:start_em/1

Example expected success output:

$ ./riak_test rtdev.config gh_riak_core_155
14:14:12.895 [info] Application lager started on node 'riak_test@127.0.0.1'
14:14:12.897 [info] Riak path: "/tmp/rt"
14:14:12.898 [info] Running: /tmp/rt/dev/dev1/bin/riak stop
14:14:13.119 [info] Resetting nodes to fresh state
14:14:13.119 [debug] Running: git --git-dir="/tmp/rt/dev/.git" --work-tree="/tmp/rt/dev" reset HEAD --hard
14:14:13.145 [debug] Running: git --git-dir="/tmp/rt/dev/.git" --work-tree="/tmp/rt/dev" clean -fd
14:14:13.160 [info] Running: /tmp/rt/dev/dev1/bin/riak start
14:14:14.906 [debug] Supervisor inet_gethost_native_sup started undefined at pid <0.60.0>
14:14:14.907 [debug] Supervisor kernel_safe_sup started inet_gethost_native:start_link() at pid <0.59.0>
14:14:14.920 [info] Deployed nodes: ['dev1@127.0.0.1']
14:14:14.922 [info] Cluster built: ['dev1@127.0.0.1']
14:14:14.930 [info] Adding delayed start to app.config
14:14:14.930 [info] Running: /tmp/rt/dev/dev1/bin/riak stop
14:14:17.161 [info] Running: /tmp/rt/dev/dev1/bin/riak start
14:14:20.270 [info] Running: /tmp/rt/dev/dev1/bin/riak stop
14:14:22.503 [info] Running: /tmp/rt/dev/dev1/bin/riak start
14:14:23.667 [info] Installed mocks to delay riak_kv proxy startup
14:14:23.667 [info] Issuing 10000 gets against 'dev1@127.0.0.1'
14:14:25.590 [info] Verifying 'dev1@127.0.0.1' has not crashed
14:14:35.601 [info] Test passed
Jon Meredith
Owner

Should there be a pr for search too?

Joseph Blomstedt
Owner

It's unclear to me if the search and pipe code actually ever send requests to a node before the service is marked as up. I always figured it was a get/put FSM issue, where we check the service when computing preflists, but then don't recheck when sending messages. Ie. the node could restart in the middle and have the kv service currently down and proxies not yet started.

But, for safety sake, we can certainly add the one-line change to search, pipe, etc.

Jon Meredith
Owner

+1 merge

Joseph Blomstedt jtuple merged commit 3119736 into from April 04, 2012
Joseph Blomstedt jtuple closed this April 04, 2012
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
This page is out of date. Refresh to see the latest.
8  src/riak_core_app.erl
@@ -36,6 +36,14 @@ start(_StartType, _StartArgs) ->
36 36
     %% riak_core_sysmon_minder start it, because that process can act
37 37
     %% on any handler crash notification, whereas we cannot.
38 38
 
  39
+    case application:get_env(riak_core, delayed_start) of
  40
+        {ok, Delay} ->
  41
+            lager:info("Delaying riak_core startup as requested"),
  42
+            timer:sleep(Delay);
  43
+        _ ->
  44
+            ok
  45
+    end,
  46
+
39 47
     %% Validate that the ring state directory exists
40 48
     riak_core_util:start_app_deps(riak_core),
41 49
     RingStateDir = app_helper:get_env(riak_core, ring_state_dir),
19  src/riak_core_vnode_master.erl
@@ -25,7 +25,7 @@
25 25
 -module(riak_core_vnode_master).
26 26
 -include("riak_core_vnode.hrl").
27 27
 -behaviour(gen_server).
28  
--export([start_link/1, start_link/2, get_vnode_pid/2,
  28
+-export([start_link/1, start_link/2, start_link/3, get_vnode_pid/2,
29 29
          start_vnode/2, command/3, command/4, sync_command/3,
30 30
          coverage/5,
31 31
          command_return_vnode/4,
@@ -50,9 +50,12 @@ start_link(VNodeMod) ->
50 50
     start_link(VNodeMod, undefined).
51 51
 
52 52
 start_link(VNodeMod, LegacyMod) ->
  53
+    start_link(VNodeMod, LegacyMod, undefined).
  54
+
  55
+start_link(VNodeMod, LegacyMod, Service) ->
53 56
     RegName = reg_name(VNodeMod),
54 57
     gen_server:start_link({local, RegName}, ?MODULE,
55  
-                          [VNodeMod,LegacyMod,RegName], []).
  58
+                          [Service,VNodeMod,LegacyMod,RegName], []).
56 59
 
57 60
 start_vnode(Index, VNodeMod) ->
58 61
     riak_core_vnode_manager:start_vnode(Index, VNodeMod).
@@ -152,7 +155,8 @@ all_nodes(VNodeMod) ->
152 155
     [Pid || {_Mod, _Idx, Pid} <- VNodes].
153 156
 
154 157
 %% @private
155  
-init([VNodeMod, LegacyMod, _RegName]) ->
  158
+init([Service, VNodeMod, LegacyMod, _RegName]) ->
  159
+    gen_server:cast(self(), {wait_for_service, Service}),
156 160
     {ok, #state{idxtab=undefined,
157 161
                 vnode_mod=VNodeMod,
158 162
                 legacy=LegacyMod}}.
@@ -178,6 +182,15 @@ do_proxy_cast({VMaster, Node}, Req=?COVERAGE_REQ{index=Idx}) ->
178 182
 do_proxy_cast({VMaster, Node}, Other) ->
179 183
     gen_server:cast({VMaster, Node}, Other).
180 184
 
  185
+handle_cast({wait_for_service, Service}, State) ->
  186
+    case Service of
  187
+        undefined ->
  188
+            ok;
  189
+        _ ->
  190
+            lager:debug("Waiting for service: ~p", [Service]),
  191
+            riak_core:wait_for_service(Service)
  192
+    end,
  193
+    {noreply, State};
181 194
 handle_cast(Req=?VNODE_REQ{index=Idx}, State=#state{vnode_mod=Mod}) ->
182 195
     Proxy = riak_core_vnode_proxy:reg_name(Mod, Idx),
183 196
     gen_fsm:send_event(Proxy, Req),
1  src/riak_core_vnode_proxy_sup.erl
@@ -47,6 +47,7 @@ stop_proxy(Mod, Index) ->
47 47
     ok.
48 48
 
49 49
 start_proxies(Mod) ->
  50
+    lager:debug("Starting vnode proxies for: ~p", [Mod]),
50 51
     Indices = get_indices(),
51 52
     [start_proxy(Mod, Index) || Index <- Indices],
52 53
     ok.
Commit_comment_tip

Tip: You can add notes to lines in a file. Hover to the left of a line to make a note

Something went wrong with that request. Please try again.