Skip to content

Commit

Permalink
When shard splitting make sure to reset the targets before any retries
Browse files Browse the repository at this point in the history
Previously the target was reset only when the whole job started, but not when
the initial copy phase restarted on its own. If that happened, we left the
target around so the retry failed always with the `eexist` error.

Target reset has a check to make sure the shards are not in the global shard
map, in case someone manually added them, for example. If they are found there
the job panics and exists.
  • Loading branch information
nickva committed Jan 10, 2020
1 parent 24201a3 commit dd1b281
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 15 deletions.
14 changes: 4 additions & 10 deletions src/mem3/src/mem3_reshard_job.erl
Original file line number Diff line number Diff line change
Expand Up @@ -184,19 +184,12 @@ run(#job{split_state = CurrState} = Job) ->


set_start_state(#job{split_state = State} = Job) ->
case {State, maps:get(State, ?STATE_RESTART, undefined)} of
{_, undefined} ->
case maps:get(State, ?STATE_RESTART, undefined) of
undefined ->
Fmt1 = "~p recover : unknown state ~s",
couch_log:error(Fmt1, [?MODULE, jobfmt(Job)]),
erlang:error({invalid_split_job_recover_state, Job});
{initial_copy, initial_copy} ->
% Since we recover from initial_copy to initial_copy, we need
% to reset the target state as initial_copy expects to
% create a new target
Fmt2 = "~p recover : resetting target ~s",
couch_log:notice(Fmt2, [?MODULE, jobfmt(Job)]),
reset_target(Job);
{_, StartState} ->
StartState->
Job#job{split_state = StartState}
end.

Expand Down Expand Up @@ -403,6 +396,7 @@ initial_copy_impl(#job{source = Source, target = Targets0} = Job) ->
LogMsg1 = "~p initial_copy started ~s",
LogArgs1 = [?MODULE, shardsstr(Source, Targets0)],
couch_log:notice(LogMsg1, LogArgs1),
reset_target(Job),
case couch_db_split:split(SourceName, TMap, fun pickfun/3) of
{ok, Seq} ->
LogMsg2 = "~p initial_copy of ~s finished @ seq:~p",
Expand Down
6 changes: 1 addition & 5 deletions src/mem3/test/eunit/mem3_reshard_test.erl
Original file line number Diff line number Diff line change
Expand Up @@ -453,19 +453,15 @@ target_reset_in_initial_copy(#{db1 := Db}) ->
job_state = running,
split_state = initial_copy
},
BogusParent = spawn(fun() -> receive {ack, _, _} -> ok end end),
put('$ancestors', [BogusParent]), % make prock_lib:ack not blow up
meck:expect(mem3_reshard, checkpoint, 2, ok),
meck:expect(couch_db_split, cleanup_target, 2, ok),
meck:expect(couch_server, exists, fun
(<<"t1">>) -> true;
(<<"t2">>) -> true;
(DbName) -> meck:passthrough([DbName])
end),
JobPid = spawn(fun() -> mem3_reshard_job:init(Job) end),
JobPid = spawn(fun() -> mem3_reshard_job:initial_copy_impl(Job) end),
meck:wait(2, couch_db_split, cleanup_target, ['_', '_'], 5000),
exit(JobPid, kill),
exit(BogusParent, kill),
?assertEqual(2, meck:num_calls(couch_db_split, cleanup_target, 2))
end)}.

Expand Down

0 comments on commit dd1b281

Please sign in to comment.