
Add retry on eleveldb lock errors during open for up to 1 minute. #395

Merged
merged 1 commit into from almost 2 years ago

2 participants

Jon Meredith Reid Draper
Jon Meredith

If a vnode crashes, the eleveldb NIF will close the leveldb database cleanly, writing out any pending data. However, the vnode supervisor can restart the crashed vnode faster than that close completes, triggering a lock error that fails the vnode start and takes Riak down.

This patch adds a retry every two seconds, waiting up to a minute for the lock to clear.
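
For reference, both knobs the patch introduces are read from the riak_kv application environment via app_helper:get_env, with defaults of 30 retries and a 2000 ms delay (roughly one minute total). A minimal sketch of how they could be tuned in the riak_kv section of app.config, using the defaults from the diff below as example values:

%% riak_kv section of app.config -- values shown are the patch defaults
%% (30 retries x 2000 ms is about one minute); adjust to taste.
{riak_kv, [
    {eleveldb_open_retries, 30},
    {eleveldb_open_retry_delay, 2000}
]}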

Jon Meredith jonmeredith Add retry on eleveldb lock errors during open for up to 1 minute.
If a vnode crashes then the eleveldb NIF will close the leveldb database
cleanly, writing out any pending data.  The vnode supervisor can restart
the crashed vnode faster than the close process and trigger a lock error.

This patch adds a retry every two seconds, waiting up to a minute
for the lock to clear.
5032740
Jon Meredith

Here's a way to play with it from the console — turn on debug logging, start a handful of writers, then kill every riak_kv vnode on all nodes so the supervisor restarts them while leveldb is still flushing:

lager:set_loglevel(lager_console_backend, debug).
Bytes = crypto:rand_bytes(1000000).
{ok, C} = riak:local_client().
F = fun(X) -> C:put(riak_object:new(<<"b">>, <<"k">>, Bytes)), X(X) end.
[spawn(fun() -> F(F) end) || _ <- lists:seq(1, 10)].
timer:sleep(10000),
rpc:multicall(erlang,
              apply,
              [fun() ->
                       [exit(Pid, kill) ||
                           {_, _, Pid} <-
                               riak_core_vnode_manager:all_vnodes(riak_kv_vnode),
                           is_pid(Pid)]
               end,
               []]).
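
If you'd rather watch the retries exhaust instead of succeed, the two settings the patch adds can be shrunk from the same console before running the kill loop (the same application:set_env calls the retry_fail_test below uses; the values here are just illustrative):

%% Shrink the retry window so open gives up quickly (illustrative values).
application:set_env(riak_kv, eleveldb_open_retries, 3).
application:set_env(riak_kv, eleveldb_open_retry_delay, 100).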

Reid Draper

don't think you need the \n with lager

It'll add one if I don't, so why make it do extra work :)

Collaborator

fair enough

Reid Draper

how about a lager:debug here to say we've exhausted our retries?

Could do - I deliberately pass back the last error I got, so I thought that would show up in the logs... a little extra debug couldn't hurt.
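
For context, what comes back after the last failed attempt is the db_open tuple from eleveldb, so something along these lines would surface in the crash report (the "IO error: lock " prefix is what the patch matches on; the path and the rest of the message here are just illustrative):

%% Illustrative shape of the propagated error after retries are exhausted.
{error, {db_open, "IO error: lock /var/lib/riak/leveldb/0/LOCK: already held by process"}}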

Collaborator

Good point, nbd either way then

Reid Draper
Collaborator

Nice tests. Looks good other than the two comments above.

Jon Meredith jonmeredith merged commit 1360c26 into from

Showing 1 unique commit by 1 author.

Sep 14, 2012
Jon Meredith jonmeredith Add retry on eleveldb lock errors during open for up to 1 minute.
5032740

Showing 1 changed file with 116 additions and 1 deletion.

src/riak_kv_eleveldb_backend.erl (+116, −1)
@@ -368,10 +368,31 @@ init_state(DataRoot, Config) ->
              config = FinalConfig }.
 
 %% @private
-open_db(State0) ->
+open_db(State) ->
+    RetriesLeft = app_helper:get_env(riak_kv, eleveldb_open_retries, 30),
+    open_db(State, max(1, RetriesLeft), undefined).
+
+open_db(_State0, 0, LastError) ->
+    {error, LastError};
+open_db(State0, RetriesLeft, _) ->
     case eleveldb:open(State0#state.data_root, State0#state.open_opts) of
         {ok, Ref} ->
             {ok, State0#state { ref = Ref }};
+        %% Check specifically for lock error, this can be caused if
+        %% a crashed vnode takes some time to flush leveldb information
+        %% out to disk. The process is gone, but the NIF resource cleanup
+        %% may not have completed.
+        {error, {db_open, OpenErr}=Reason} ->
+            case lists:prefix("IO error: lock ", OpenErr) of
+                true ->
+                    SleepFor = app_helper:get_env(riak_kv, eleveldb_open_retry_delay, 2000),
+                    lager:debug("Leveldb backend retrying ~p in ~p ms after error ~s\n",
+                                [State0#state.data_root, SleepFor, OpenErr]),
+                    timer:sleep(SleepFor),
+                    open_db(State0, RetriesLeft - 1, Reason);
+                false ->
+                    {error, Reason}
+            end;
         {error, Reason} ->
             {error, Reason}
     end.
@@ -545,6 +566,100 @@ custom_config_test_() ->
     application:set_env(eleveldb, data_root, ""),
     riak_kv_backend:standard_test(?MODULE, [{data_root, "test/eleveldb-backend"}]).
 
+retry_test() ->
+    Root = "/tmp/eleveldb_retry_test",
+    try
+        {ok, State1} = start(42, [{data_root, Root}]),
+        Me = self(),
+        Pid1 = spawn_link(fun() ->
+                                  receive
+                                      stop ->
+                                          Me ! {1, stop(State1)}
+                                  end
+                          end),
+        _Pid2 = spawn_link(
+                  fun() ->
+                          Me ! {2, running},
+                          Me ! {2, start(42, [{data_root, Root}])}
+                  end),
+        %% Ensure Pid2 is running and give it 10ms to get into the open
+        %% so we know it has a lock clash
+        receive
+            {2, running} ->
+                timer:sleep(10);
+            X ->
+                throw({unexpected, X})
+        after
+            5000 ->
+                throw(timeout1)
+        end,
+        %% Tell Pid1 to shut it down
+        Pid1 ! stop,
+        receive
+            {1, ok} ->
+                ok;
+            X2 ->
+                throw({unexpected, X2})
+        after
+            5000 ->
+                throw(timeout2)
+        end,
+        %% Wait for Pid2
+        receive
+            {2, {ok, _State2}} ->
+                ok;
+            {2, Res} ->
+                throw({notok, Res});
+            X3 ->
+                throw({unexpected, X3})
+        end
+    after
+        os:cmd("rm -rf " ++ Root)
+    end.
+
+retry_fail_test() ->
+    Root = "/tmp/eleveldb_fail_retry_test",
+    try
+        application:set_env(riak_kv, eleveldb_open_retries, 3), % 3 times, 1ms a time
+        application:set_env(riak_kv, eleveldb_open_retry_delay, 1),
+        {ok, State1} = start(42, [{data_root, Root}]),
+        Me = self(),
+        spawn_link(
+          fun() ->
+                  Me ! {2, running},
+                  Me ! {2, start(42, [{data_root, Root}])}
+          end),
+        %% Ensure Pid2 is running and give it 10ms to get into the open
+        %% so we know it has a lock clash
+        receive
+            {2, running} ->
+                ok;
+            X ->
+                throw({unexpected, X})
+        after
+            5000 ->
+                throw(timeout1)
+        end,
+        %% Wait for Pid2 to fail
+        receive
+            {2, {error, {db_open, _Why}}} ->
+                ok;
+            {2, Res} ->
+                throw({expect_fail, Res});
+            X3 ->
+                throw({unexpected, X3})
+        end,
+        %% Then close and reopen, just for kicks to prove it was the locking
+        ok = stop(State1),
+        {ok, State2} = start(42, [{data_root, Root}]),
+        ok = stop(State2)
+    after
+        os:cmd("rm -rf " ++ Root),
+        application:unset_env(riak_kv, eleveldb_open_retries),
+        application:unset_env(riak_kv, eleveldb_open_retry_delay)
+    end.
+
+
 -ifdef(EQC).
 
 eqc_test_() ->
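
To run just the new tests, eunit can be pointed at the module from an Erlang shell with the riak_kv beams on the code path; a quick sketch, assuming a standard eunit setup (this runs all of the module's eunit tests, including the two added here):

%% Run the backend's eunit suite with verbose output.
eunit:test(riak_kv_eleveldb_backend, [verbose]).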
