Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP

Loading…

Add retry on eleveldb lock errors during open for up to 1 minute. #395

Merged
merged 1 commit into from

2 participants

@jonmeredith
Owner

If a vnode crashes, the eleveldb NIF will close the leveldb database cleanly, writing out any pending data. However, the vnode supervisor can restart the crashed vnode faster than the close process and trigger a lock error which fails the vnode start and takes riak down.

This patch adds a retry every two seconds, waiting up to a minute for the lock to clear.

@jonmeredith jonmeredith Add retry on eleveldb lock errors during open for up to 1 minute.
If a vnode crashes then the eleveldb NIF will close the leveldb database
cleanly, writing out any pending data.  The vnode supervisor can restart
the crashed vnode faster than the close process and trigger a lock error.

This patch adds a retry every two seconds, waiting up to a minute
for the lock to clear.
5032740
@jonmeredith
Owner

Here's an on the console way to play with it...

lager:set_loglevel(lager_console_backend, debug).
Bytes = crypto:rand_bytes(1000000).
{ok,C}=riak:local_client().
F = fun(X) -> C:put(riak_object:new(<<"b">>,<<"k">>,Bytes)), X(X) end. 
[spawn(fun() -> F(F) end) || I <- lists:seq(1,10)].
timer:sleep(10000),
rpc:multicall(erlang,
                 apply,
                 [fun() ->
                         [ 
                          exit(Pid, kill) ||
                              {_,_,Pid} <-
                                  riak_core_vnode_manager:all_vnodes(riak_kv_vnode),
                              is_pid(Pid)
                         ]
                  end,
                  []]).

@reiddraper reiddraper was assigned
@reiddraper

don't think you need the \n with lager

It'll add one if I don't, so why make it do extra work :)

fair enough

@reiddraper

how about a lager:debug here to say we've exhausted our retries?

Could do - I deliberately pass back the last error I got, so I thought that would show up in the logs... a little extra debug couldn't hurt.

Good point, nbd either way then

@reiddraper

Nice tests. Looks good other than the two comments above.

@jonmeredith jonmeredith merged commit 1360c26 into 1.2

1 check passed

Details default The Travis build passed
@seancribbs seancribbs deleted the jdm-eleveldb-retry-on-lock branch
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Commits on Sep 14, 2012
  1. @jonmeredith

    Add retry on eleveldb lock errors during open for up to 1 minute.

    jonmeredith authored
    If a vnode crashes then the eleveldb NIF will close the leveldb database
    cleanly, writing out any pending data.  The vnode supervisor can restart
    the crashed vnode faster than the close process and trigger a lock error.
    
    This patch adds a retry every two seconds, waiting up to a minute
    for the lock to clear.
This page is out of date. Refresh to see the latest.
Showing with 116 additions and 1 deletion.
  1. +116 −1 src/riak_kv_eleveldb_backend.erl
View
117 src/riak_kv_eleveldb_backend.erl
@@ -368,10 +368,31 @@ init_state(DataRoot, Config) ->
config = FinalConfig }.
%% @private
-open_db(State0) ->
+open_db(State) ->
+ RetriesLeft = app_helper:get_env(riak_kv, eleveldb_open_retries, 30),
+ open_db(State, max(1, RetriesLeft), undefined).
+
+open_db(_State0, 0, LastError) ->
+ {error, LastError};
+open_db(State0, RetriesLeft, _) ->
case eleveldb:open(State0#state.data_root, State0#state.open_opts) of
{ok, Ref} ->
{ok, State0#state { ref = Ref }};
+ %% Check specifically for lock error, this can be caused if
+ %% a crashed vnode takes some time to flush leveldb information
+ %% out to disk. The process is gone, but the NIF resource cleanup
+ %% may not have completed.
+ {error, {db_open, OpenErr}=Reason} ->
+ case lists:prefix("IO error: lock ", OpenErr) of
+ true ->
+ SleepFor = app_helper:get_env(riak_kv, eleveldb_open_retry_delay, 2000),
+ lager:debug("Leveldb backend retrying ~p in ~p ms after error ~s\n",
+ [State0#state.data_root, SleepFor, OpenErr]),
+ timer:sleep(SleepFor),
+ open_db(State0, RetriesLeft - 1, Reason);
+ false ->
+ {error, Reason}
+ end;
{error, Reason} ->
{error, Reason}
end.
@@ -545,6 +566,100 @@ custom_config_test_() ->
application:set_env(eleveldb, data_root, ""),
riak_kv_backend:standard_test(?MODULE, [{data_root, "test/eleveldb-backend"}]).
+retry_test() ->
+ Root = "/tmp/eleveldb_retry_test",
+ try
+ {ok, State1} = start(42, [{data_root, Root}]),
+ Me = self(),
+ Pid1 = spawn_link(fun() ->
+ receive
+ stop ->
+ Me ! {1, stop(State1)}
+ end
+ end),
+ _Pid2 = spawn_link(
+ fun() ->
+ Me ! {2, running},
+ Me ! {2, start(42, [{data_root, Root}])}
+ end),
+ %% Ensure Pid2 is running and give it 10ms to get into the open
+ %% so we know it has a lock clash
+ receive
+ {2, running} ->
+ timer:sleep(10);
+ X ->
+ throw({unexpected, X})
+ after
+ 5000 ->
+ throw(timeout1)
+ end,
+ %% Tell Pid1 to shut it down
+ Pid1 ! stop,
+ receive
+ {1, ok} ->
+ ok;
+ X2 ->
+ throw({unexpected, X2})
+ after
+ 5000 ->
+ throw(timeout2)
+ end,
+ %% Wait for Pid2
+ receive
+ {2, {ok, _State2}} ->
+ ok;
+ {2, Res} ->
+ throw({notok, Res});
+ X3 ->
+ throw({unexpected, X3})
+ end
+ after
+ os:cmd("rm -rf " ++ Root)
+ end.
+
+retry_fail_test() ->
+ Root = "/tmp/eleveldb_fail_retry_test",
+ try
+ application:set_env(riak_kv, eleveldb_open_retries, 3), % 3 times, 1ms a time
+ application:set_env(riak_kv, eleveldb_open_retry_delay, 1),
+ {ok, State1} = start(42, [{data_root, Root}]),
+ Me = self(),
+ spawn_link(
+ fun() ->
+ Me ! {2, running},
+ Me ! {2, start(42, [{data_root, Root}])}
+ end),
+ %% Ensure Pid2 is running and give it 10ms to get into the open
+ %% so we know it has a lock clash
+ receive
+ {2, running} ->
+ ok;
+ X ->
+ throw({unexpected, X})
+ after
+ 5000 ->
+ throw(timeout1)
+ end,
+ %% Wait for Pid2 to fail
+ receive
+ {2, {error, {db_open, _Why}}} ->
+ ok;
+ {2, Res} ->
+ throw({expect_fail, Res});
+ X3 ->
+ throw({unexpected, X3})
+ end,
+ %% Then close and reopen, just for kicks to prove it was the locking
+ ok = stop(State1),
+ {ok, State2} = start(42, [{data_root, Root}]),
+ ok = stop(State2)
+ after
+ os:cmd("rm -rf " ++ Root),
+ application:unset_env(riak_kv, eleveldb_open_retries),
+ application:unset_env(riak_kv, eleveldb_open_retry_delay)
+ end.
+
+
-ifdef(EQC).
eqc_test_() ->
Something went wrong with that request. Please try again.