Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP

Loading…

assume handoff if vnode exits 'normal' during queue requeuest #64

Closed
wants to merge 1 commit into from

2 participants

@beerriot

The pipe_verify_handoff_blocking test in basho/riak_test#154 demonstrated
that the monitor used by riak_pipe_vnode:queue_work_send may fire while
handoff is taking place. This situation is further described in
#63.

This PR addresses the problem by first checking whether the Reason
for the vnode's exit was normal. If it was not normal, the
vnode_down error is returned as before. If it was normal, then it is
assumed that handoff happened, and the monitor is moved to the new
vnode, as indicated by riak_core_ring:next_owner/2.

This PR targets bwf-riak_test, because commits in that branch are needed to run the riak_test proof, but I wanted this PR to be clear about what it was.

@beerriot beerriot assume handoff if vnode exits 'normal' during queue requeuest
The pipe_verify_handoff_blocking test in riak_test PR 154 demonstrated
that the monitor used by riak_pipe_vnode:queue_work_send may fire while
handoff is taking place. This situation is further described in
riak_pipe issue 63.

This commit addresses the problem by first checking whether the Reason
for the vnode's exit was 'normal'. If it was not 'normal', the
vnode_down error is returned as before. If it was 'normal', then it is
assumed that handoff happened, and the monitor is moved to the new
vnode, as indicated by riak_core_ring:next_owner/2.
fd72f55
@beerriot beerriot closed this
@russelldb
Owner

re-opened as #68

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Commits on Jan 13, 2013
  1. @beerriot

    assume handoff if vnode exits 'normal' during queue requeuest

    beerriot authored
    The pipe_verify_handoff_blocking test in riak_test PR 154 demonstrated
    that the monitor used by riak_pipe_vnode:queue_work_send may fire while
    handoff is taking place. This situation is further described in
    riak_pipe issue 63.
    
    This commit addresses the problem by first checking whether the Reason
    for the vnode's exit was 'normal'. If it was not 'normal', the
    vnode_down error is returned as before. If it was 'normal', then it is
    assumed that handoff happened, and the monitor is moved to the new
    vnode, as indicated by riak_core_ring:next_owner/2.
This page is out of date. Refresh to see the latest.
Showing with 36 additions and 11 deletions.
  1. +36 −11 src/riak_pipe_vnode.erl
View
47 src/riak_pipe_vnode.erl
@@ -335,17 +335,7 @@ queue_work_send(#fitting{ref=Ref}=Fitting,
{raw, Ref, self()},
riak_pipe_vnode_master) of
{ok, VnodePid} ->
- %% monitor in case the vnode is gone before it
- %% responds to this request
- MonRef = erlang:monitor(process, VnodePid),
- %% block until input confirmed queued, for backpressure
- receive
- {Ref, Reply} ->
- erlang:demonitor(MonRef),
- Reply;
- {'DOWN',MonRef,process,VnodePid,Reason} ->
- {error, {vnode_down, Reason}}
- end;
+ queue_work_wait(Ref, Index, Node, VnodePid);
{error, timeout} ->
{error, {vnode_proxy_timeout, {Index, Node}}}
catch exit:{{nodedown, Node}, _GenServerCall} ->
@@ -353,6 +343,41 @@ queue_work_send(#fitting{ref=Ref}=Fitting,
{error, {nodedown, Node}}
end.
+queue_work_wait(Ref, Index, Node, VnodePid) ->
+ %% monitor in case the vnode is gone before it
+ %% responds to this request
+ MonRef = erlang:monitor(process, VnodePid),
+ %% block until input confirmed queued, for backpressure
+ receive
+ {Ref, Reply} ->
+ erlang:demonitor(MonRef),
+ Reply;
+ {'DOWN',MonRef,process,VnodePid,normal} ->
+ %% the vnode likely just shut down after completing handoff
+ {ok, Ring} = riak_core_ring_manager:get_my_ring(),
+ case riak_core_ring:next_owner(Ring, Index) of
+ {undefined, undefined, undefined} ->
+ %% ownership finished changing before we asked
+ %% ... check if Next==Node?
+ Next = riak_core_ring:index_owner(Ring);
+ {Node, Next, _Status} ->
+ %% ownership is still changing ... should this be
+ %% looser, and not care whether the transfer was
+ %% from Node?
+ ok
+ end,
+ %% monitor new vnode, since the input will be handled
+ %% there, instead of at the vnode originally contacted
+ {ok, NextPid} = rpc:call(Next,
+ riak_core_vnode_master,
+ get_vnode_pid,
+ [Index, riak_pipe_vnode]),
+ queue_work_wait(Ref, Index, Node, NextPid);
+ {'DOWN',MonRef,process,VnodePid,Reason} ->
+ %% the vnode died unexpectedly
+ {error, {vnode_down, Reason}}
+ end.
+
%% @doc Send end-of-inputs for a fitting to a vnode. Note: this
%% should only be called by `riak_pipe_fitting' processes. This
%% will cause the vnode to shutdown the worker, dispose of the
Something went wrong with that request. Please try again.