Skip to content
This repository
Browse code

Further refine Riak KV health check

Change health check to have both a lower and upper threshold. The
health check will fail whenever a message queue surpasses the upper
threshold, and then will continue to fail until all queues are below
the lower threshold. This prevents constantly transitioning back and
forth between healthy and non-healthy states.

Add output messages that print out when the health check transitions
from healthy to non-healthy or from non-healthy to healthy.
  • Loading branch information...
commit 0767648dd626e630509cfbcb9926bff230a8ab64 1 parent e504124
Joseph Blomstedt authored December 14, 2012

Showing 1 changed file with 27 additions and 5 deletions. Show diff stats Hide diff stats

  1. 32  src/riak_kv_app.erl
32  src/riak_kv_app.erl
@@ -77,7 +77,7 @@ start(_Type, _StartArgs) ->
77 77
     case code:ensure_loaded(StorageBackend) of
78 78
         {error,nofile} ->
79 79
             lager:critical("storage_backend ~p is non-loadable.",
80  
-                                   [StorageBackend]),
  80
+                           [StorageBackend]),
81 81
             throw({error, invalid_storage_backend});
82 82
         _ ->
83 83
             ok
@@ -193,21 +193,43 @@ check_epoch() ->
193 193
              calendar:universal_time()),
194 194
     case GSec - ((MSec*1000000)+Sec) of
195 195
         N when (N < ?SEC_TO_EPOCH+5 andalso N > ?SEC_TO_EPOCH-5);
196  
-        (N < -?SEC_TO_EPOCH+5 andalso N > -?SEC_TO_EPOCH-5) ->
  196
+               (N < -?SEC_TO_EPOCH+5 andalso N > -?SEC_TO_EPOCH-5) ->
197 197
             %% if epoch is within 10 sec of expected, accept it
198 198
             ok;
199 199
         N ->
200 200
             Epoch = calendar:gregorian_seconds_to_datetime(N),
201 201
             lager:error("Riak expects your system's epoch to be Jan 1, 1970,"
202  
-                "but your system says the epoch is ~p", [Epoch]),
  202
+                        "but your system says the epoch is ~p", [Epoch]),
203 203
             ok
204 204
     end.
205 205
 
206 206
 check_kv_health(_Pid) ->
207 207
     VNodes = riak_core_vnode_manager:all_index_pid(riak_kv_vnode),
208  
-    Threshold = app_helper:get_env(riak_kv, vnode_mailbox_limit, 5000),
  208
+    {Low, High} = app_helper:get_env(riak_kv, vnode_mailbox_limit, {1, 5000}),
  209
+    case lists:member(riak_kv, riak_core_node_watcher:services(node())) of
  210
+        true ->
  211
+            %% Service active, use high watermark
  212
+            Mode = enabled,
  213
+            Threshold = High;
  214
+        false ->
  215
+            %% Service disabled, use low watermark
  216
+            Mode = disabled,
  217
+            Threshold = Low
  218
+    end,
  219
+
209 220
     SlowVNs =
210 221
         [{Idx,Len} || {Idx, Pid} <- VNodes,
211 222
                       {message_queue_len, Len} <- process_info(Pid, [message_queue_len]),
212 223
                       Len > Threshold],
213  
-    SlowVNs =:= [].
  224
+    Passed = (SlowVNs =:= []),
  225
+
  226
+    case {Passed, Mode} of
  227
+        {false, enabled} ->
  228
+            lager:info("Disabling riak_kv due to large message queues. "
  229
+                       "Offending vnodes: ~p", [SlowVNs]);
  230
+        {true, disabled} ->
  231
+            lager:info("Re-enabling riak_kv after successful health check");
  232
+        _ ->
  233
+            ok
  234
+    end,
  235
+    Passed.

0 notes on commit 0767648

Please sign in to comment.
Something went wrong with that request. Please try again.