Skip to content

Commit

Permalink
Add a few more commit and query engine metrics
Browse files Browse the repository at this point in the history
fsync times can vary widely, especially on a remote block storage, so add a
histogram and count for fsync calls. At a higher level, we can coalesce similar
update doc requests from multiple clients, so add metrics to track that as
well.

While improving couch_proc_manager and experimenting with the quickjs js engine [1],
we noticed we lacked metrics in that area. Add metrics to track query process
acquires, start/stops (including crashes) as well as how many prompt and prompt
errors we have. These should help gain some confidence when upgrading to a new
engine version or engine type by comparing the before and after metrics output.

While trying to test some of these new metrics, we noticed the couch_file tests did most of
the test logic in the setup part, which resulted in plenty of log noise during
the test runs about started/stopped applications, so updated a few tests which
were affected by this PR to use the ?TDEF_FE macro. That should make test
output a bit smaller and neater.

[1] #4448 (comment)
  • Loading branch information
nickva committed May 20, 2023
1 parent 116c5b0 commit 951f1b9
Show file tree
Hide file tree
Showing 9 changed files with 156 additions and 50 deletions.
44 changes: 44 additions & 0 deletions src/couch/priv/stats_descriptions.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,18 @@
{type, histogram},
{desc, <<"length of a request inside CouchDB without MochiWeb">>}
]}.
{[couchdb, commits], [
{type, counter},
{desc, <<"number of commits performed">>}
]}.
{[couchdb, coalesced_updates, interactive], [
{type, counter},
{desc, <<"number of coalesced interactive updates">>}
]}.
{[couchdb, coalesced_updates, replicated], [
{type, counter},
{desc, <<"number of coalesced replicated updates">>}
]}.
{[couchdb, couch_server, lru_skip], [
{type, counter},
{desc, <<"number of couch_server LRU operations skipped">>}
Expand All @@ -290,6 +302,30 @@
{type, histogram},
{desc, <<"duration of validate_doc_update function calls">>}
]}.
{[couchdb, query_server, acquired_processes], [
{type, counter},
{desc, <<"number of acquired external processes">>}
]}.
{[couchdb, query_server, process_starts], [
{type, counter},
{desc, <<"number of OS process starts">>}
]}.
% Names below must match the metric lists passed to
% couch_stats:increment_counter/1 in couch_os_process.erl and
% couch_proc_manager.erl, otherwise the increments are silently lost.
{[couchdb, query_server, process_exits], [
    {type, counter},
    {desc, <<"number of OS normal process exits">>}
]}.
{[couchdb, query_server, process_error_exits], [
    {type, counter},
    {desc, <<"number of OS error process exits">>}
]}.
{[couchdb, query_server, process_errors], [
    {type, counter},
    {desc, <<"number of OS process errors">>}
]}.
{[couchdb, query_server, process_prompts], [
    {type, counter},
    {desc, <<"number of successful OS process prompts">>}
]}.
{[couchdb, query_server, process_prompt_errors], [
    {type, counter},
    {desc, <<"number of OS process prompt errors">>}
]}.
{[pread, exceed_eof], [
{type, counter},
{desc, <<"number of the attempts to read beyond end of db file">>}
Expand All @@ -298,6 +334,14 @@
{type, counter},
{desc, <<"number of the attempts to read beyond set limit">>}
]}.
{[fsync, time], [
{type, histogram},
{desc, <<"microseconds to call fsync">>}
]}.
{[fsync, count], [
{type, counter},
{desc, <<"number of fsync calls">>}
]}.
{[mango, unindexed_queries], [
{type, counter},
{desc, <<"number of mango queries that could not use an index">>}
Expand Down
1 change: 1 addition & 0 deletions src/couch/src/couch_bt_engine.erl
Original file line number Diff line number Diff line change
Expand Up @@ -570,6 +570,7 @@ commit_data(St) ->
couch_file:sync(Fd),
ok = couch_file:write_header(Fd, NewHeader),
couch_file:sync(Fd),
couch_stats:increment_counter([couchdb, commits]),
{ok, St#st{
header = NewHeader,
needs_commit = false
Expand Down
4 changes: 4 additions & 0 deletions src/couch/src/couch_db_updater.erl
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,10 @@ collect_updates(GroupedDocsAcc, ClientsAcc, MergeConflicts) ->
% updaters than deal with their possible conflicts, and local docs
% writes are relatively rare. Can be optimized later if really needed.
{update_docs, Client, GroupedDocs, [], MergeConflicts} ->
case MergeConflicts of
true -> couch_stats:increment_counter([couchdb, coalesced_updates, replicated]);
false -> couch_stats:increment_counter([couchdb, coalesced_updates, interactive])
end,
GroupedDocs2 = sort_and_tag_grouped_docs(Client, GroupedDocs),
GroupedDocsAcc2 =
merge_updates(GroupedDocsAcc, GroupedDocs2),
Expand Down
14 changes: 11 additions & 3 deletions src/couch/src/couch_file.erl
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ sync(Filepath) when is_list(Filepath) ->
case file:open(Filepath, [append, raw]) of
{ok, Fd} ->
try
case file:sync(Fd) of
case fsync(Fd) of
ok ->
ok;
{error, Reason} ->
Expand Down Expand Up @@ -437,7 +437,7 @@ init({Filepath, Options, ReturnPid, Ref}) ->
true ->
{ok, 0} = file:position(Fd, 0),
ok = file:truncate(Fd),
ok = file:sync(Fd),
ok = fsync(Fd),
maybe_track_open_os_files(Options),
erlang:send_after(?INITIAL_WAIT, self(), maybe_close),
{ok, #file{fd = Fd, is_sys = IsSys, pread_limit = Limit}};
Expand Down Expand Up @@ -554,7 +554,7 @@ handle_call({set_db_pid, Pid}, _From, #file{db_monitor = OldRef} = File) ->
Ref = monitor(process, Pid),
{reply, ok, File#file{db_monitor = Ref}};
handle_call(sync, _From, #file{fd = Fd} = File) ->
case file:sync(Fd) of
case fsync(Fd) of
ok ->
{reply, ok, File};
{error, _} = Error ->
Expand Down Expand Up @@ -642,6 +642,14 @@ format_status(_Opt, [PDict, #file{} = File]) ->
{_Fd, FilePath} = couch_util:get_value(couch_file_fd, PDict),
[{data, [{"State", File}, {"InitialFilePath", FilePath}]}].

fsync(Fd) ->
    %% Instrumented wrapper around file:sync/1: record the call latency in
    %% the [fsync, time] histogram (microseconds) and bump [fsync, count].
    %% Uses monotonic time, so the measurement is immune to clock changes.
    Started = erlang:monotonic_time(microsecond),
    Result = file:sync(Fd),
    Elapsed = erlang:monotonic_time(microsecond) - Started,
    couch_stats:update_histogram([fsync, time], Elapsed),
    couch_stats:increment_counter([fsync, count]),
    Result.

find_header(Fd, Block) ->
case (catch load_header(Fd, Block)) of
{ok, Bin} ->
Expand Down
7 changes: 7 additions & 0 deletions src/couch/src/couch_os_process.erl
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ init([Command, Options, PortOptions]) ->
KillCmd = iolist_to_binary(readline(BaseProc)),
Pid = self(),
couch_log:debug("OS Process Start :: ~p", [BaseProc#os_proc.port]),
couch_stats:increment_counter([couchdb, query_server, process_starts]),
spawn(fun() ->
% this ensure the real os process is killed when this process dies.
erlang:monitor(process, Pid),
Expand Down Expand Up @@ -207,13 +208,17 @@ handle_call({prompt, Data}, _From, #os_proc{idle = Idle} = OsProc) ->
#os_proc{writer = Writer, reader = Reader} = OsProc,
try
Writer(OsProc, Data),
couch_stats:increment_counter([couchdb, query_server, process_prompts]),
{reply, {ok, Reader(OsProc)}, OsProc, Idle}
catch
throw:{error, OsError} ->
couch_stats:increment_counter([couchdb, query_server, process_errors]),
{reply, OsError, OsProc, Idle};
throw:{fatal, OsError} ->
couch_stats:increment_counter([couchdb, query_server, process_errors]),
{stop, normal, OsError, OsProc};
throw:OtherError ->
couch_stats:increment_counter([couchdb, query_server, process_errors]),
{stop, normal, OtherError, OsProc}
after
garbage_collect()
Expand Down Expand Up @@ -243,9 +248,11 @@ handle_info(timeout, #os_proc{idle = Idle} = OsProc) ->
{noreply, OsProc, Idle};
handle_info({Port, {exit_status, 0}}, #os_proc{port = Port} = OsProc) ->
couch_log:info("OS Process terminated normally", []),
couch_stats:increment_counter([couchdb, query_server, process_exits]),
{stop, normal, OsProc};
handle_info({Port, {exit_status, Status}}, #os_proc{port = Port} = OsProc) ->
couch_log:error("OS Process died with status: ~p", [Status]),
couch_stats:increment_counter([couchdb, query_server, process_error_exits]),
{stop, {exit_status, Status}, OsProc};
handle_info(Msg, #os_proc{idle = Idle} = OsProc) ->
couch_log:debug("OS Proc: Unknown info: ~p", [Msg]),
Expand Down
8 changes: 6 additions & 2 deletions src/couch/src/couch_proc_manager.erl
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,17 @@ get_proc(#doc{body = {Props}} = DDoc, DbKey, {_DDocId, _Rev} = DDocKey) ->
Lang = couch_util:to_binary(LangStr),
Client = #client{lang = Lang, ddoc = DDoc, db_key = DbKey, ddoc_key = DDocKey},
Timeout = get_os_process_timeout(),
gen_server:call(?MODULE, {get_proc, Client}, Timeout).
Res = gen_server:call(?MODULE, {get_proc, Client}, Timeout),
couch_stats:increment_counter([couchdb, query_server, acquired_processes]),
Res.

get_proc(LangStr) ->
    % Check out a query server process for the given language string.
    % The acquired_processes counter is bumped only after gen_server:call/3
    % returns, i.e. only for successful acquisitions (the call exits on
    % timeout or failure before the counter is touched).
    Lang = couch_util:to_binary(LangStr),
    Client = #client{lang = Lang},
    Timeout = get_os_process_timeout(),
    Res = gen_server:call(?MODULE, {get_proc, Client}, Timeout),
    couch_stats:increment_counter([couchdb, query_server, acquired_processes]),
    Res.

ret_proc(#proc{} = Proc) ->
gen_server:call(?MODULE, {ret_proc, Proc}, infinity).
Expand Down
115 changes: 72 additions & 43 deletions src/couch/test/eunit/couch_file_tests.erl
Original file line number Diff line number Diff line change
Expand Up @@ -70,67 +70,86 @@ read_write_test_() ->
"Common file read/write tests",
{
setup,
fun() -> test_util:start(?MODULE, [ioq]) end,
fun test_util:stop/1,
?foreach([
fun should_increase_file_size_on_write/1,
fun should_return_current_file_size_on_write/1,
fun should_write_and_read_term/1,
fun should_write_and_read_binary/1,
fun should_write_and_read_large_binary/1,
fun should_return_term_as_binary_for_reading_binary/1,
fun should_read_term_written_as_binary/1,
fun should_read_iolist/1,
fun should_fsync/1,
fun should_not_read_beyond_eof/1,
fun should_truncate/1
])
fun() -> test_util:start_couch() end,
fun test_util:stop_couch/1,
{
foreach,
fun setup/0,
fun teardown/1,
[
?TDEF_FE(should_increase_file_size_on_write),
?TDEF_FE(should_return_current_file_size_on_write),
?TDEF_FE(should_write_and_read_term),
?TDEF_FE(should_write_and_read_binary),
?TDEF_FE(should_write_and_read_large_binary),
?TDEF_FE(should_return_term_as_binary_for_reading_binary),
?TDEF_FE(should_read_term_written_as_binary),
?TDEF_FE(should_read_iolist),
?TDEF_FE(should_fsync),
?TDEF_FE(should_update_fsync_stats),
?TDEF_FE(should_not_read_beyond_eof),
?TDEF_FE(should_truncate)
]
}
}
}.

should_increase_file_size_on_write(Fd) ->
    % Appending a term to a fresh file must grow its size past zero.
    {ok, 0, _} = couch_file:append_term(Fd, foo),
    {ok, Size} = couch_file:bytes(Fd),
    ?assert(Size > 0).

should_return_current_file_size_on_write(Fd) ->
    % The position returned by the second append must equal the file size
    % reported after the first append.
    {ok, 0, _} = couch_file:append_term(Fd, foo),
    {ok, Size} = couch_file:bytes(Fd),
    ?assertMatch({ok, Size, _}, couch_file:append_term(Fd, bar)).

should_write_and_read_term(Fd) ->
    % A term written with append_term must read back unchanged.
    {ok, Pos, _} = couch_file:append_term(Fd, foo),
    ?assertMatch({ok, foo}, couch_file:pread_term(Fd, Pos)).

should_write_and_read_binary(Fd) ->
    % A binary written with append_binary must read back unchanged.
    {ok, Pos, _} = couch_file:append_binary(Fd, <<"fancy!">>),
    ?assertMatch({ok, <<"fancy!">>}, couch_file:pread_binary(Fd, Pos)).

should_return_term_as_binary_for_reading_binary(Fd) ->
    % Reading a stored term via pread_binary returns its on-disk
    % (snappy-compressed) representation rather than the decoded term.
    {ok, Pos, _} = couch_file:append_term(Fd, foo),
    Foo = couch_compress:compress(foo, snappy),
    ?assertMatch({ok, Foo}, couch_file:pread_binary(Fd, Pos)).

should_read_term_written_as_binary(Fd) ->
    % The bytes below are the external term format for the atom 'foo';
    % pread_term must decode them back to the atom.
    {ok, Pos, _} = couch_file:append_binary(Fd, <<131, 100, 0, 3, 102, 111, 111>>),
    ?assertMatch({ok, foo}, couch_file:pread_term(Fd, Pos)).

should_write_and_read_large_binary(Fd) ->
    % A 100KB binary must round-trip through append_binary/pread_binary.
    BigBin = list_to_binary(lists:duplicate(100000, 0)),
    {ok, Pos, _} = couch_file:append_binary(Fd, BigBin),
    ?assertMatch({ok, BigBin}, couch_file:pread_binary(Fd, Pos)).

should_read_iolist(Fd) ->
    %% append_binary == append_iolist?
    %% Possible bug in pread_iolist or iolist() -> append_binary
    {ok, Pos, _} = couch_file:append_binary(Fd, ["foo", $m, <<"bam">>]),
    {ok, IoList} = couch_file:pread_iolist(Fd, Pos),
    ?assertMatch(<<"foombam">>, iolist_to_binary(IoList)).

should_fsync(Fd) ->
    % Sanity check: couch_file:sync/1 returns ok for an open file handle.
    ?assertMatch(ok, couch_file:sync(Fd)).

should_update_fsync_stats(Fd) ->
    % Each couch_file:sync/1 call should bump the [fsync, count] counter
    % and record a latency sample in the [fsync, time] histogram.
    CountBefore = couch_stats:sample([fsync, count]),
    [ok = couch_file:sync(Fd) || _ <- lists:seq(1, 10)],
    TimeHist = couch_stats:sample([fsync, time]),
    CountAfter = couch_stats:sample([fsync, count]),
    ?assert(CountAfter > CountBefore),
    Max = proplists:get_value(max, TimeHist),
    Percentiles = proplists:get_value(percentile, TimeHist),
    ?assert(Max > 0),
    ?assertMatch([{50, P50} | _] when P50 > 0, Percentiles).

should_not_read_beyond_eof(_) ->
{ok, Fd} = couch_file:open(?tempfile(), [create, overwrite]),
BigBin = list_to_binary(lists:duplicate(100000, 0)),
DoubleBin = round(byte_size(BigBin) * 2),
{ok, Pos, _Size} = couch_file:append_binary(Fd, BigBin),
Expand All @@ -140,48 +159,58 @@ should_not_read_beyond_eof(Fd) ->
ok = file:pwrite(Io, Pos, <<0:1/integer, DoubleBin:31/integer>>),
file:close(Io),
unlink(Fd),
ExpectedError = {badmatch, {'EXIT', {bad_return_value, {read_beyond_eof, Filepath}}}},
?_assertError(ExpectedError, couch_file:pread_binary(Fd, Pos)).
ExpectExit = {bad_return_value, {read_beyond_eof, Filepath}},
ExpectError = {badmatch, {'EXIT', ExpectExit}},
?assertError(ExpectError, couch_file:pread_binary(Fd, Pos)),
catch file:close(Fd).

should_truncate(Fd) ->
    % Append a small term, remember the size, append a large binary, then
    % truncate back to the remembered size; the original term must still
    % be readable from position 0.
    {ok, 0, _} = couch_file:append_term(Fd, foo),
    {ok, Size} = couch_file:bytes(Fd),
    BigBin = list_to_binary(lists:duplicate(100000, 0)),
    {ok, _, _} = couch_file:append_binary(Fd, BigBin),
    ok = couch_file:truncate(Fd, Size),
    ?assertMatch({ok, foo}, couch_file:pread_term(Fd, 0)).

pread_limit_test_() ->
    % Fixture: start couch with a 50000-byte max_pread_size, open a fresh
    % couch_file per test (setup/0, teardown/1), and restore the config
    % on the way out. _Persist = false keeps the setting out of the ini.
    {
        "Read limit tests",
        {
            setup,
            fun() ->
                Ctx = test_util:start_couch([ioq]),
                config:set("couchdb", "max_pread_size", "50000", _Persist = false),
                Ctx
            end,
            fun(Ctx) ->
                config:delete("couchdb", "max_pread_size", _Persist = false),
                test_util:stop_couch(Ctx)
            end,
            {
                foreach,
                fun setup/0,
                fun teardown/1,
                [
                    ?TDEF_FE(should_increase_file_size_on_write),
                    ?TDEF_FE(should_return_current_file_size_on_write),
                    ?TDEF_FE(should_write_and_read_term),
                    ?TDEF_FE(should_write_and_read_binary),
                    ?TDEF_FE(should_not_read_more_than_pread_limit)
                ]
            }
        }
    }.

should_not_read_more_than_pread_limit(_) ->
    % Append a binary larger than the configured max_pread_size (50000)
    % and verify that reading it back fails with exceed_pread_limit.
    % The file is opened locally (ignoring the fixture Fd) so it can be
    % unlinked and closed best-effort after the expected crash.
    {ok, Fd} = couch_file:open(?tempfile(), [create, overwrite]),
    {_, Filepath} = couch_file:process_info(Fd),
    BigBin = list_to_binary(lists:duplicate(100000, 0)),
    {ok, Pos, _Size} = couch_file:append_binary(Fd, BigBin),
    unlink(Fd),
    ExpectExit = {bad_return_value, {exceed_pread_limit, Filepath, 50000}},
    ExpectError = {badmatch, {'EXIT', ExpectExit}},
    ?assertError(ExpectError, couch_file:pread_binary(Fd, Pos)),
    catch file:close(Fd).

header_test_() ->
{
Expand Down
8 changes: 7 additions & 1 deletion src/couch/test/eunit/couch_js_tests.erl
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,11 @@ should_create_sandbox() ->
?assertMatch([[[true, <<_/binary>>]]], Result),
[[[true, ErrMsg]]] = Result,
?assertNotEqual([], binary:matches(ErrMsg, <<"not defined">>)),
?assert(couch_stats:sample([couchdb, query_server, process_starts]) > 0),
?assert(couch_stats:sample([couchdb, query_server, process_prompts]) > 0),
?assert(couch_stats:sample([couchdb, query_server, acquired_processes]) > 0),
?assert(couch_stats:sample([couchdb, query_server, process_exists]) >= 0),
?assert(couch_stats:sample([couchdb, query_server, process_errors]) >= 0),
couch_query_servers:ret_os_process(Proc).

%% erlfmt-ignore
Expand Down Expand Up @@ -274,7 +279,8 @@ should_exit_on_internal_error() ->
% It may fail and just exit the process. That's expected as well
throw:{os_process_error, _} ->
ok
end.
end,
?assert(couch_stats:sample([couchdb, query_server, process_errors]) > 0).

trigger_oom(Proc) ->
Status =
Expand Down
Loading

0 comments on commit 951f1b9

Please sign in to comment.