From 89c5bc9a243d7f0ba8e43b52b9addadccdb7d1ca Mon Sep 17 00:00:00 2001 From: Hans Bolinder Date: Thu, 10 Apr 2014 15:23:44 +0200 Subject: [PATCH] Fix rare race condition in Dets The correction is due to the the evil testcase dets_SUITE:simultaneous_open(). If the process repairing a Dets file is killed (should normally never happen), and another process tries to repair the file, a temporary file from the first process could live on for a while, even after a successful call to file:delete(). This has only been seen on W-nd-ows, where it is a known problem. There are other ways to deal with the problem (rename the file; use some other filename), but we continue using one certain filename in order to be as backwards compatible as possible. --- lib/stdlib/src/dets.erl | 25 +++--- lib/stdlib/src/dets_server.erl | 18 ++++- lib/stdlib/test/dets_SUITE.erl | 138 ++++++++++++++++++++++++--------- 3 files changed, 133 insertions(+), 48 deletions(-) diff --git a/lib/stdlib/src/dets.erl b/lib/stdlib/src/dets.erl index 76e03bbfaaf8..a4bd45ea1910 100644 --- a/lib/stdlib/src/dets.erl +++ b/lib/stdlib/src/dets.erl @@ -2839,17 +2839,22 @@ fsck_try(Fd, Tab, FH, Fname, SlotNumbers, Version) -> tempfile(Fname) -> Tmp = lists:concat([Fname, ".TMP"]), - tempfile(Tmp, 10). - -tempfile(Tmp, 0) -> - Tmp; -tempfile(Tmp, N) -> case file:delete(Tmp) of - {error, eacces} -> % 'dets_process_died' happened anyway... (W-nd-ws) - timer:sleep(1000), - tempfile(Tmp, N-1); - _ -> - Tmp + {error, _Reason} -> % typically enoent + ok; + ok -> + assure_no_file(Tmp) + end, + Tmp. + +assure_no_file(File) -> + case file:read_file_info(File) of + {ok, _FileInfo} -> + %% Wait for some other process to close the file: + timer:sleep(100), + assure_no_file(File); + {error, _} -> + ok end. %% -> {ok, NewHead} | {try_again, integer()} | Error diff --git a/lib/stdlib/src/dets_server.erl b/lib/stdlib/src/dets_server.erl index 268c20104768..3164d40f3570 100644 --- a/lib/stdlib/src/dets_server.erl +++ b/lib/stdlib/src/dets_server.erl @@ -1,7 +1,7 @@ %% %% %CopyrightBegin% %% -%% Copyright Ericsson AB 2001-2013. All Rights Reserved. +%% Copyright Ericsson AB 2001-2014. All Rights Reserved. %% %% The contents of this file are subject to the Erlang Public License, %% Version 1.1, (the "License"); you may not use this file except in @@ -171,9 +171,15 @@ handle_info({pending_reply, {Ref, Result0}}, State) -> link(Pid), do_link(Store, FromPid), true = ets:insert(Store, {FromPid, Tab}), - true = ets:insert(?REGISTRY, {Tab, 1, Pid}), - true = ets:insert(?OWNERS, {Pid, Tab}), + %% do_internal_open() has already done the following: + %% true = ets:insert(?REGISTRY, {Tab, 1, Pid}), + %% true = ets:insert(?OWNERS, {Pid, Tab}), {ok, Tab}; + {Reply, internal_open} -> + %% Clean up what do_internal_open() did: + true = ets:delete(?REGISTRY, Tab), + true = ets:delete(?OWNERS, Pid), + Reply; {Reply, _} -> % ok or Error Reply end, @@ -309,6 +315,12 @@ do_internal_open(State, From, Args) -> [T, _, _] -> T; [_, _] -> Ref end, + %% Pretend the table is open. If someone else tries to + %% open the file it will always become a pending + %% 'add_user' request. If someone tries to use the table + %% there will be a delay, but that is OK. + true = ets:insert(?REGISTRY, {Tab, 1, Pid}), + true = ets:insert(?OWNERS, {Pid, Tab}), pending_call(Tab, Pid, Ref, From, Args, internal_open, State); Error -> {Error, State} diff --git a/lib/stdlib/test/dets_SUITE.erl b/lib/stdlib/test/dets_SUITE.erl index 119b4dc7cb5f..3b08ac165ef5 100644 --- a/lib/stdlib/test/dets_SUITE.erl +++ b/lib/stdlib/test/dets_SUITE.erl @@ -223,8 +223,7 @@ open(Config, Version) -> ?format("Crashing dets server \n", []), process_flag(trap_exit, true), - Procs = [whereis(?DETS_SERVER) | map(fun(Tab) -> dets:info(Tab, pid) end, - Tabs)], + Procs = [whereis(?DETS_SERVER) | [dets:info(Tab, pid) || Tab <- Tabs]], foreach(fun(Pid) -> exit(Pid, kill) end, Procs), timer:sleep(100), c:flush(), %% flush all the EXIT sigs @@ -235,18 +234,32 @@ open(Config, Version) -> open_files(1, All, Version), ?format("Checking contents of repaired files \n", []), check(Tabs, Data), - - close_all(Tabs), + close_all(Tabs), delete_files(All), - P1 = pps(), + {Ports0, Procs0} = P0, - {Ports1, Procs1} = P1, - true = Ports1 =:= Ports0, - %% The dets_server process has been restarted: - [_] = Procs0 -- Procs1, - [_] = Procs1 -- Procs0, - ok. + Test = fun() -> + P1 = pps(), + {Ports1, Procs1} = P1, + show("Old port", Ports0 -- Ports1), + show("New port", Ports1 -- Ports0), + show("Old procs", Procs0 -- Procs1), + show("New procs", Procs1 -- Procs0), + io:format("Remaining Dets-pids (should be nil): ~p~n", + [find_dets_pids()]), + true = Ports1 =:= Ports0, + %% The dets_server process has been restarted: + [_] = Procs0 -- Procs1, + [_] = Procs1 -- Procs0, + ok + end, + case catch Test() of + ok -> ok; + _ -> + timer:sleep(500), + ok = Test() + end. check(Tabs, Data) -> foreach(fun(Tab) -> @@ -3275,12 +3288,22 @@ simultaneous_open(Config) -> File = filename(Tab, Config), ok = monit(Tab, File), - ok = kill_while_repairing(Tab, File), - ok = kill_while_init(Tab, File), - ok = open_ro(Tab, File), - ok = open_w(Tab, File, 0, Config), - ok = open_w(Tab, File, 100, Config), - ok. + case feasible() of + false -> {comment, "OK, but did not run all of the test"}; + true -> + ok = kill_while_repairing(Tab, File), + ok = kill_while_init(Tab, File), + ok = open_ro(Tab, File), + ok = open_w(Tab, File, 0, Config), + ok = open_w(Tab, File, 100, Config) + end. + +feasible() -> + LP = erlang:system_info(logical_processors), + (is_integer(LP) + andalso LP >= erlang:system_info(schedulers_online) + andalso not erlang:system_info(debug_compiled) + andalso not erlang:system_info(lock_checking)). %% One process logs and another process closes the log. Before %% monitors were used, this would make the client never return. @@ -3307,7 +3330,6 @@ kill_while_repairing(Tab, File) -> Delay = 1000, dets:start(), Parent = self(), - Ps = processes(), F = fun() -> R = (catch dets:open_file(Tab, [{file,File}])), timer:sleep(Delay), @@ -3318,7 +3340,7 @@ kill_while_repairing(Tab, File) -> P1 = spawn(F), P2 = spawn(F), P3 = spawn(F), - DetsPid = find_dets_pid([P1, P2, P3 | Ps]), + DetsPid = find_dets_pid(), exit(DetsPid, kill), receive {P1,R1} -> R1 end, @@ -3342,12 +3364,6 @@ kill_while_repairing(Tab, File) -> file:delete(File), ok. -find_dets_pid(P0) -> - case lists:sort(processes() -- P0) of - [P, _] -> P; - _ -> timer:sleep(100), find_dets_pid(P0) - end. - find_dets_pid() -> case find_dets_pids() of [] -> @@ -3421,6 +3437,13 @@ open_ro(Tab, File) -> open_w(Tab, File, Delay, Config) -> create_opened_log(File), + + Tab2 = t2, + File2 = filename(Tab2, Config), + file:delete(File2), + {ok,Tab2} = dets:open_file(Tab2, [{file,File2}]), + ok = dets:close(Tab2), + Parent = self(), F = fun() -> R = dets:open_file(Tab, [{file,File}]), @@ -3430,16 +3453,16 @@ open_w(Tab, File, Delay, Config) -> Pid1 = spawn(F), Pid2 = spawn(F), Pid3 = spawn(F), - undefined = dets:info(Tab), % is repairing now - 0 = qlen(), - Tab2 = t2, - File2 = filename(Tab2, Config), - file:delete(File2), + ok = wait_for_repair_to_start(Tab), + + %% It is assumed that it takes some time to repair the file. {ok,Tab2} = dets:open_file(Tab2, [{file,File2}]), + %% The Dets server managed to handle to open_file request. + 0 = qlen(), % still repairing + ok = dets:close(Tab2), file:delete(File2), - 0 = qlen(), % still repairing receive {Pid1,R1} -> {ok, Tab} = R1 end, receive {Pid2,R2} -> {ok, Tab} = R2 end, @@ -3456,6 +3479,15 @@ open_w(Tab, File, Delay, Config) -> file:delete(File), ok. +wait_for_repair_to_start(Tab) -> + case catch dets_server:get_pid(Tab) of + {'EXIT', _} -> + timer:sleep(1), + wait_for_repair_to_start(Tab); + Pid when is_pid(Pid) -> + ok + end. + qlen() -> {_, {_, N}} = lists:keysearch(message_queue_len, 1, process_info(self())), N. @@ -4350,6 +4382,7 @@ check_badarg({'EXIT', {badarg, [{M,F,A,_} | _]}}, M, F, Args) -> true = test_server:is_native(M) andalso length(Args) =:= A. check_pps({Ports0,Procs0} = P0) -> + ok = check_dets_tables(), case pps() of P0 -> ok; @@ -4375,13 +4408,45 @@ check_pps({Ports0,Procs0} = P0) -> end end. +%% Copied from dets_server.erl: +-define(REGISTRY, dets_registry). +-define(OWNERS, dets_owners). +-define(STORE, dets). + +check_dets_tables() -> + Store = [T || + T <- ets:all(), + ets:info(T, name) =:= ?STORE, + owner(T) =:= dets], + S = case Store of + [Tab] -> ets:tab2list(Tab); + [] -> [] + end, + case {ets:tab2list(?REGISTRY), ets:tab2list(?OWNERS), S} of + {[], [], []} -> ok; + {R, O, _} -> + io:format("Registry: ~p~n", [R]), + io:format("Owners: ~p~n", [O]), + io:format("Store: ~p~n", [S]), + not_ok + end. + +owner(Tab) -> + Owner = ets:info(Tab, owner), + case process_info(Owner, registered_name) of + {registered_name, Name} -> Name; + _ -> Owner + end. + show(_S, []) -> ok; -show(S, [Pid|Pids]) when is_pid(Pid) -> - io:format("~s: ~p~n", [S, erlang:process_info(Pid)]), +show(S, [{Pid, Name, InitCall}|Pids]) when is_pid(Pid) -> + io:format("~s: ~w (~w), ~w: ~p~n", + [S, Pid, proc_reg_name(Name), InitCall, + erlang:process_info(Pid)]), show(S, Pids); -show(S, [Port|Ports]) when is_port(Port)-> - io:format("~s: ~p~n", [S, erlang:port_info(Port)]), +show(S, [{Port, _}|Ports]) when is_port(Port)-> + io:format("~s: ~w: ~p~n", [S, Port, erlang:port_info(Port)]), show(S, Ports). pps() -> @@ -4397,5 +4462,8 @@ process_list() -> safe_second_element(process_info(P, initial_call))} || P <- processes()]. +proc_reg_name({registered_name, Name}) -> Name; +proc_reg_name([]) -> no_reg_name. + safe_second_element({_,Info}) -> Info; safe_second_element(Other) -> Other.