Skip to content

Commit

Permalink
Merge pull request #4507 from apache/prometheus_metrics
Browse files Browse the repository at this point in the history
feat: additional prometheus metrics
  • Loading branch information
willholley committed Apr 3, 2023
2 parents d141255 + 8c1ef5b commit a9bce2f
Show file tree
Hide file tree
Showing 6 changed files with 82 additions and 27 deletions.
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@
"postCreateCommand": "./configure && make",

"extensions": ["erlang-ls.erlang-ls"]
}
}
1 change: 0 additions & 1 deletion src/couch/src/couch.app.src
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@
ioq,
couch_stats,
hyper,
couch_prometheus,
couch_dist
]},
{env, [
Expand Down
2 changes: 1 addition & 1 deletion src/couch_prometheus/src/couch_prometheus.app.src
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
{description, "Aggregated metrics info for Prometheus consumption"},
{vsn, git},
{registered, []},
{applications, [kernel, stdlib, folsom, couch_stats, couch_log]},
{applications, [kernel, stdlib, folsom, couch_stats, couch_log, mem3, couch]},
{mod, {couch_prometheus_app, []}},
{env, []}
]}.
51 changes: 31 additions & 20 deletions src/couch_prometheus/src/couch_prometheus_server.erl
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,33 @@ get_system_stats() ->
get_message_queue_stats(),
get_run_queue_stats(),
get_vm_stats(),
get_ets_stats()
get_ets_stats(),
get_internal_replication_jobs_stat(),
get_membership_stat()
]).

% Emits the uptime_seconds counter for this node.
% couch_app:uptime() is integer-divided by 1000 before being reported
% (presumably a milliseconds -> seconds conversion; confirm in couch_app).
get_uptime_stat() ->
    UptimeSeconds = couch_app:uptime() div 1000,
    to_prom(uptime_seconds, counter, "couchdb uptime", UptimeSeconds).

% Emits the internal_replication_jobs gauge, whose value is whatever
% mem3_sync:get_backlog() currently reports.
get_internal_replication_jobs_stat() ->
    Backlog = mem3_sync:get_backlog(),
    Desc = "count of internal replication changes to process",
    to_prom(internal_replication_jobs, gauge, Desc, Backlog).

% Emits the membership gauge as two labelled samples:
%   nodes="cluster_nodes" - number of nodes returned by mem3:nodes()
%                           (the nodes expected to be in the cluster)
%   nodes="all_nodes"     - number of nodes returned by
%                           nodes([this, visible]) (this node plus the
%                           visible nodes it is connected to)
get_membership_stat() ->
    ExpectedCount = length(mem3:nodes()),
    ConnectedCount = length(nodes([this, visible])),
    to_prom(membership, gauge, "count of nodes in the cluster", [
        {[{nodes, "cluster_nodes"}], ExpectedCount},
        {[{nodes, "all_nodes"}], ConnectedCount}
    ]).

get_vm_stats() ->
MemLabels = lists:map(
fun({Type, Value}) ->
Expand Down Expand Up @@ -177,24 +198,27 @@ get_io_stats() ->
].

% Builds the Prometheus entries describing the message queues of every
% registered process: the sum, minimum and maximum of the queue lengths,
% plus one labelled erlang_message_queue_size sample per registered name.
%
% NOTE(review): this span reads like a rendered diff hunk. Two variants of
% the same statements appear side by side (QLenFun vs QFun, and
% lists:sum/min/max over Queues vs over QueueLens). Presumably the
% QLenFun / Queues-argument lines are the superseded ones; confirm against
% the real module before relying on this text.
get_message_queue_stats() ->
QLenFun = fun(Name) -> message_queue_len(whereis(Name)) end,
Queues = lists:map(QLenFun, registered()),
% Pair each registered name with the queue length of the process that
% the name currently resolves to.
QFun = fun(Name) -> {Name, message_queue_len(whereis(Name))} end,
Queues = lists:map(QFun, registered()),
% Bare lengths, used for the aggregate sum/min/max entries below.
QueueLens = lists:map(fun({_, Len}) -> Len end, Queues),
% One {Labels, Value} instance per name, labelled with queue_name.
QueueLenByLabel = lists:map(fun({Name, Len}) -> {[{queue_name, Name}], Len} end, Queues),
[
to_prom(
erlang_message_queues, gauge, "total size of all message queues", lists:sum(Queues)
erlang_message_queues, gauge, "total size of all message queues", lists:sum(QueueLens)
),
to_prom(
erlang_message_queue_min,
gauge,
"minimum size across all message queues",
lists:min(Queues)
lists:min(QueueLens)
),
to_prom(
erlang_message_queue_max,
gauge,
"maximum size across all message queues",
lists:max(Queues)
)
lists:max(QueueLens)
),
% Per-queue series: a single metric carrying many labelled samples.
to_prom(erlang_message_queue_size, gauge, "size of message queue", QueueLenByLabel)
].

message_queue_len(undefined) ->
Expand Down Expand Up @@ -247,19 +271,6 @@ update_refresh_timer() ->

-include_lib("couch/include/couch_eunit.hrl").

% Every entry produced by get_system_stats() must be a binary which, once
% trimmed, passes starts_with/2 for either <<"couchdb_">> or <<"# ">>.
system_stats_test() ->
    _ = [
        begin
            ?assert(is_binary(Line)),
            Trimmed = string:trim(Line),
            ?assert(starts_with(<<"couchdb_">>, Trimmed) orelse starts_with(<<"# ">>, Trimmed))
        end
     || Line <- get_system_stats()
    ],
    ok.

% Returns true when the binary Line begins with the whole of the binary
% Prefix, false otherwise. Both arguments must be binaries; anything else
% fails with function_clause.
%
% binary:longest_common_prefix/1 returns how many leading bytes the two
% binaries share, so Prefix is a real prefix of Line only when that count
% covers all of Prefix. Checking merely for "> 0" would accept any Line
% that shares just its first byte with Prefix.
starts_with(Prefix, Line) when is_binary(Prefix), is_binary(Line) ->
    binary:longest_common_prefix([Prefix, Line]) =:= byte_size(Prefix).

message_queue_len_test() ->
self() ! refresh,
?assert(message_queue_len(self()) >= 1),
Expand Down
21 changes: 19 additions & 2 deletions src/couch_prometheus/src/couch_prometheus_util.erl
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,19 @@ couch_to_prom([couchdb, httpd_status_codes, 200], Info, _All) ->
});
couch_to_prom([couchdb, httpd_status_codes, Code], Info, _All) ->
to_prom(httpd_status_codes, {[{code, Code}], val(Info)});
% Report these metrics as Prometheus gauges. This is required because
% Prometheus assumes that counters are cumulative and are normally
% queried with rate(), whereas folsom (the library CouchDB uses for
% metrics) allows counters to be decremented as well. Folsom supports
% gauges but does not track their state to allow increment/decrement.
% Basically, anywhere we use couch_stats:decrement_count we should
% be converting to a Prometheus gauge.
couch_to_prom([couchdb, open_databases], Info, _All) ->
to_prom(open_databases, gauge, desc(Info), val(Info));
couch_to_prom([couchdb, open_os_files], Info, _All) ->
to_prom(open_os_files, gauge, desc(Info), val(Info));
couch_to_prom([couchdb, httpd, clients_requesting_changes], Info, _All) ->
to_prom(httpd_clients_requesting_changes, gauge, desc(Info), val(Info));
couch_to_prom([ddoc_cache, hit], Info, All) ->
Total = val(Info) + val([ddoc_cache, miss], All),
to_prom(ddoc_cache_requests_total, counter, "number of design doc cache requests", Total);
Expand Down Expand Up @@ -109,9 +122,13 @@ type_def(Metric, Type, Desc) ->
to_bin(io_lib:format("# TYPE ~s ~s", [Name, Type]))
].

% to_prom/4: renders one metric as its type_def/3 header followed by its
% sample line(s).
%
% NOTE(review): this span reads like a rendered diff hunk. A single-clause
% form (head directly below, body "[TypeStr] ++ to_prom(Metric, Data).")
% appears interleaved with a two-clause form. Presumably the two-clause
% form is the current one; confirm against the real module.
to_prom(Metric, Type, Desc, Data) ->
% support creating a metric series with multiple label/values.
% Instances is of the form [{[{LabelName, LabelValue}], Value}, ...]
to_prom(Metric, Type, Desc, Instances) when is_list(Instances) ->
TypeStr = type_def(Metric, Type, Desc),
[TypeStr] ++ to_prom(Metric, Data).
% The header is emitted once, then each instance is rendered through
% to_prom/2 and the results are concatenated.
[TypeStr] ++ lists:flatmap(fun(Inst) -> to_prom(Metric, Inst) end, Instances);
% Non-list data is wrapped in a one-element list and handled by the
% list clause.
to_prom(Metric, Type, Desc, Data) ->
to_prom(Metric, Type, Desc, [Data]).

to_prom(Metric, Instances) when is_list(Instances) ->
lists:flatmap(fun(Inst) -> to_prom(Metric, Inst) end, Instances);
Expand Down
32 changes: 30 additions & 2 deletions src/couch_prometheus/test/eunit/couch_prometheus_e2e_tests.erl
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ e2e_test_() ->
?TDEF_FE(t_chttpd_port),
?TDEF_FE(t_prometheus_port),
?TDEF_FE(t_metric_updated),
?TDEF_FE(t_no_duplicate_metrics)
?TDEF_FE(t_no_duplicate_metrics),
?TDEF_FE(t_starts_with_couchdb)
]
}
}
Expand Down Expand Up @@ -70,7 +71,7 @@ reject_test_() ->
}.

setup_prometheus(WithAdditionalPort) ->
Ctx = test_util:start_couch([chttpd]),
Ctx = test_util:start_couch([mem3, chttpd, couch_prometheus]),
Persist = false,
Hashed = couch_passwords:hash_admin_password(?PASS),
ok = config:set("admins", ?USER, binary_to_list(Hashed), Persist),
Expand Down Expand Up @@ -145,6 +146,33 @@ t_metric_updated(Port) ->
end
).

% Fetches the stats body from the node-local Prometheus URL for Port and
% checks every "\n"-separated line: after trimming, each must be empty,
% start with "#", or start with "couchdb_". A line that does not match
% raises an assertRegexp_failed error carrying the offending line.
t_starts_with_couchdb(Port) ->
    Expect = "^(#|couchdb_|$)",
    Body = get_stats(node_local_url(Port)),
    CheckLine = fun(Line) ->
        ?assert(is_binary(Line)),
        Trimmed = string:trim(Line),
        case re:run(Trimmed, Expect) of
            nomatch ->
                erlang:error(
                    {assertRegexp_failed, [
                        {module, ?MODULE},
                        {line, ?LINE},
                        {regexp, (Trimmed)},
                        {expected_to_match, Expect},
                        {result, nomatch}
                    ]}
                );
            {match, _} ->
                ok
        end
    end,
    lists:foreach(CheckLine, re:split(Body, "\n")).

% Builds the URL of the /_node/_local/_prometheus endpoint on Port, using
% the configured chttpd bind_address ("127.0.0.1" when it is not set).
% lists:concat/1 is used so that Port does not have to be a string.
node_local_url(Port) ->
    BindAddress = config:get("chttpd", "bind_address", "127.0.0.1"),
    UrlParts = ["http://", BindAddress, ":", Port, "/_node/_local/_prometheus"],
    lists:concat(UrlParts).
Expand Down

0 comments on commit a9bce2f

Please sign in to comment.