Skip to content
Permalink
Browse files
Add Scheduler usage metrics
  • Loading branch information
ferd committed Mar 25, 2014
1 parent 8494d34 commit 984f6c5ade87e30248972d68423998ecfeabdf15
Show file tree
Hide file tree
Showing 7 changed files with 88 additions and 14 deletions.
@@ -27,6 +27,9 @@ all stable changes of the first version of Recon.

*2.x*

- 2.2.0 Upcoming (master branch, still in development):
- Adding scheduler info metrics to get a more accurate picture than what
top and CPU gives.
- 2.1.2:
- Fixing tests for R15B02 and up
- Fixing a backwards compatibility for R15B03 on `recon_alloc` operations
@@ -1,6 +1,6 @@
{application, recon,
[{description, "Diagnostic tools for production use"},
{vsn, "2.1.2"},
{vsn, "2.2.0"},
{modules, [recon]},
{registered, []},
{applications, [kernel, stdlib]}]}.
@@ -27,7 +27,8 @@
%%% {@link node_stats_print/2}, which displays them,
%%% {@link node_stats_list/2}, which returns them in a list, and
%%% {@link node_stats/4}, which provides a fold-like interface
%%% for stats gathering.</dd>
%%% for stats gathering. For CPU usage specifically, see
%%% {@link scheduler_usage/1}.</dd>
%%%
%%% <dt>2. OTP tools</dt>
%%% <dd>This category provides tools to interact with pieces of OTP
@@ -80,7 +81,8 @@
-export([info/1, info/2, info/3, info/4,
proc_count/2, proc_window/3,
bin_leak/1,
node_stats_print/2, node_stats_list/2, node_stats/4]).
node_stats_print/2, node_stats_list/2, node_stats/4,
scheduler_usage/1]).
-export([get_state/1, get_state/2]).
-export([remote_load/1, remote_load/2,
source/1]).
@@ -133,7 +135,6 @@
| port_info_io_key() | port_info_memory_key()
| port_info_specific_key().


-export_type([proc_attrs/0, inet_attrs/0, pid_term/0, port_term/0]).
-export_type([info_type/0, info_key/0,
info_meta_key/0, info_signals_key/0, info_location_key/0,
@@ -336,6 +337,39 @@ bin_leak(N) ->
node_stats_print(N, Interval) ->
node_stats(N, Interval, fun(X, _) -> io:format("~p~n",[X]) end, ok).

%% @doc Because Erlang CPU usage as reported from `top' isn't the most
%% reliable value (due to schedulers doing idle spinning to avoid going
%% to sleep and impacting latency), a metric exists that is based on
%% scheduler wall time.
%%
%% For any time interval, Scheduler wall time can be used as a measure
%% of how 'busy' a scheduler is. A scheduler is busy when:
%%
%% <ul>
%% <li>executing process code</li>
%% <li>executing driver code</li>
%% <li>executing NIF code</li>
%% <li>executing BIFs</li>
%% <li>garbage collecting</li>
%% <li>doing memory management</li>
%% </ul>
%%
%% A scheduler isn't busy when doing anything else.
-spec scheduler_usage(Millisecs) -> [{SchedulerId, Usage}] when
Millisecs :: non_neg_integer(),
SchedulerId :: pos_integer(),
Usage :: number().
scheduler_usage(Interval) when is_integer(Interval) ->
%% We start and stop the scheduler_wall_time system flag if
%% it wasn't in place already. Usually setting the flag should
%% have a CPU impact (making it higher) only when under low usage.
FormerFlag = erlang:system_flag(scheduler_wall_time, true),
First = erlang:statistics(scheduler_wall_time),
timer:sleep(Interval),
Last = erlang:statistics(scheduler_wall_time),
erlang:system_flag(scheduler_wall_time, FormerFlag),
recon_lib:scheduler_usage_diff(First, Last).

%% @doc Shorthand for `node_stats(N, Interval, fun(X,Acc) -> [X|Acc] end, [])'
%% with the results reversed to be in the right temporal order.
-spec node_stats_list(Repeat, Interval) -> [Stats] when
@@ -368,8 +402,10 @@ node_stats_list(N, Interval) ->
Stats :: {[Absolutes::{atom(),term()}],
[Increments::{atom(),term()}]}.
node_stats(N, Interval, FoldFun, Init) ->
%% Turn on scheduler wall time if it wasn't there already
FormerFlag = erlang:system_flag(scheduler_wall_time, true),
%% Stats is an ugly fun, but it does its thing.
Stats = fun({{OldIn,OldOut},{OldGCs,OldWords,_}}) ->
Stats = fun({{OldIn,OldOut},{OldGCs,OldWords,_}, SchedWall}) ->
%% Absolutes
ProcC = erlang:system_info(process_count),
RunQ = erlang:statistics(run_queue),
@@ -389,20 +425,29 @@ node_stats(N, Interval, FoldFun, Init) ->
GCCount = GCs-OldGCs,
GCWords = Words-OldWords,
{_, Reds} = erlang:statistics(reductions),
SchedWallNew = erlang:statistics(scheduler_wall_time),
SchedUsage = recon_lib:scheduler_usage_diff(SchedWall, SchedWallNew),
%% Stats Results
{{[{process_count,ProcC}, {run_queue,RunQ},
{error_logger_queue_len,LogQ}, {memory_total,Tot},
{memory_procs,ProcM}, {memory_atoms,Atom},
{memory_bin,Bin}, {memory_ets,Ets}],
[{bytes_in,BytesIn}, {bytes_out,BytesOut},
{gc_count,GCCount}, {gc_words_reclaimed,GCWords},
{reductions,Reds}]},
{reductions,Reds}, {scheduler_usage, SchedUsage}]},
%% New State
{{In,Out}, GC}}
{{In,Out}, GC, SchedWallNew}}
end,
{{input,In},{output,Out}} = erlang:statistics(io),
Gc = erlang:statistics(garbage_collection),
recon_lib:time_fold(N, Interval, Stats, {{In,Out}, Gc}, FoldFun, Init).
SchedWall = erlang:statistics(scheduler_wall_time),
Result = recon_lib:time_fold(
N, Interval, Stats,
{{In,Out}, Gc, SchedWall},
FoldFun, Init),
%% Set scheduler wall time back to what it was
erlang:system_flag(scheduler_wall_time, FormerFlag),
Result.

%%% OTP & Manipulations %%%

@@ -11,7 +11,8 @@
inet_attrs/1, inet_attrs/2,
triple_to_pid/3, term_to_pid/1,
term_to_port/1,
time_map/5, time_fold/6]).
time_map/5, time_fold/6,
scheduler_usage_diff/2]).
%% private exports
-export([binary_memory/1]).

@@ -207,6 +208,20 @@ time_fold(N, Interval, Fun, State, FoldFun, Init) ->
Acc = FoldFun(Res,Init),
time_fold(N-1,Interval,Fun,NewState,FoldFun,Acc).

%% @doc Diffs two runs of erlang:statistics(scheduler_wall_time) and
%% returns usage metrics in terms of cores and 0..1 percentages.
-spec scheduler_usage_diff(SchedTime, SchedTime) -> [{SchedulerId, Usage}] when
SchedTime :: [{SchedulerId, ActiveTime, TotalTime}],
SchedulerId :: pos_integer(),
Usage :: number(),
ActiveTime :: non_neg_integer(),
TotalTime :: non_neg_integer().
scheduler_usage_diff(First, Last) ->
lists:map(
fun({{I, A0, T0}, {I, A1, T1}}) -> {I, (A1 - A0)/(T1 - T0)} end,
lists:zip(lists:sort(First), lists:sort(Last))
).

%% @private crush binaries from process_info into their amount of place
%% taken in memory.
binary_memory(Bins) ->
@@ -110,7 +110,7 @@
%%% Note that in the pattern above, no specific function (<code>'_'</code>) was
%%% matched against. Instead, the `fun' used restricted functions to those
%%% having two arguments, the first of which is either a list or an integer
%%% greater than `3'.
%%% greater than `1'.
%%%
%%% The limit was also set using `{10,100}' instead of an integer, making the
%%% rate-limitting at 10 messages per 100 milliseconds, instead of an absolute
@@ -9,7 +9,7 @@

all() -> [{group,info}, proc_count, proc_window, bin_leak,
node_stats_list, get_state, source, tcp, udp, files, port_types,
inet_count, inet_window, binary_memory].
inet_count, inet_window, binary_memory, scheduler_usage].

groups() -> [{info, [], [info3, info4, info1, info2,
port_info1, port_info2]}].
@@ -142,12 +142,13 @@ node_stats_list(_Config) ->
{memory_procs,_},
{memory_atoms,_},
{memory_bin,_},
{memory_ets,_}],
{memory_ets,_}|_],
[{bytes_in,_},
{bytes_out,_},
{gc_count,_},
{gc_words_reclaimed,_},
{reductions,_}]} <- Res]).
{reductions,_},
{scheduler_usage,[_|_]}|_]} <- Res]).

get_state(_Config) ->
Res = recon:get_state(kernel_sup),
@@ -252,6 +253,17 @@ binary_memory(_Config) ->
[{binary,X}, {binary_memory,_}, {binary,X}] = Res1,
[{binary_memory,Y}, {binary,_}, {binary_memory,Y}] = Res2.

%% Just check that we get many schedulers and that the usage values are all
%% between 0 and 1 inclusively. We don't care for edge cases like a
%% scheduler disappearing halfway through a run.
scheduler_usage(_Config) ->
List = recon:scheduler_usage(100),
?assertEqual(length(List), length(
[1 || {Id,Rate} <- List,
is_integer(Id), Id > 0,
Rate >= 0, Rate =< 1])
).

%%%%%%%%%%%%%%%
%%% HELPERS %%%
%%%%%%%%%%%%%%%
@@ -35,7 +35,6 @@ memory(_Config) ->
%% Current vs. Max
true = Used =< recon_alloc:memory(used, max),
true = Alloc =< recon_alloc:memory(allocated, max),
true = Unused =< recon_alloc:memory(unused, max),
true = Types =< recon_alloc:memory(allocated_types, max),
true = Instances =< recon_alloc:memory(allocated_instances, max),
%% Key presences in allocateds

0 comments on commit 984f6c5

Please sign in to comment.