-
Notifications
You must be signed in to change notification settings - Fork 470
/
FD_ALL.java
450 lines (369 loc) · 16.1 KB
/
FD_ALL.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
package org.jgroups.protocols;
import org.jgroups.*;
import org.jgroups.annotations.*;
import org.jgroups.stack.Protocol;
import org.jgroups.util.*;
import java.io.DataInput;
import java.io.DataOutput;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.function.Supplier;
/**
* Failure detection based on simple heartbeat protocol. Every member periodically multicasts a heartbeat.
* Every member also maintains a table of all members (minus itself). When data or a heartbeat from P is received,
* we reset the timestamp for P to the current time. Periodically, we check for expired members, and suspect those.
* <p>
* Reduced number of messages exchanged on suspect event: https://jira.jboss.org/browse/JGRP-1241
*
* @author Bela Ban
*/
@MBean(description="Failure detection based on simple heartbeat protocol")
public class FD_ALL extends Protocol {
/* ----------------------------------------- Properties -------------------------------------------------- */
// Used (in ms) as both the initial delay and the period when the HeartbeatSender task is scheduled
@Property(description="Interval at which a HEARTBEAT is sent to the cluster")
protected long interval=8000;
// Compared (in ms) by TimeoutChecker against the age of a member's last timestamp
@Property(description="Timeout after which a node P is suspected if neither a heartbeat nor data were received from P")
protected long timeout=40000;
// Used (in ms) as both the initial delay and the period when the TimeoutChecker task is scheduled
@Property(description="Interval at which the HEARTBEAT timeouts are checked")
protected long timeout_check_interval=2000;
// When true, up() refreshes the sender's timestamp for messages that carry no heartbeat header as well
@Property(description="Treat messages received from members as heartbeats. Note that this means we're updating " +
"a value in a hashmap every time a message is passing up the stack through FD_ALL, which is costly. Default is false")
protected boolean msg_counts_as_heartbeat=false;
// Read by getTimestamp(); init() switches it off when the time service interval exceeds 'timeout'.
// NOTE(review): the fallback used by getTimestamp() is System.nanoTime(), not System.currentTimeMillis()
@Property(description="Uses TimeService to get the current time rather than System.currentTimeMillis. Might get " +
"removed soon, don't use !")
protected boolean use_time_service=true;
/* --------------------------------------------- JMX ------------------------------------------------------ */
// The three counters below are plain ints that are incremented without synchronization
@ManagedAttribute(description="Number of heartbeats sent")
protected int num_heartbeats_sent;
@ManagedAttribute(description="Number of heartbeats received")
protected int num_heartbeats_received;
@ManagedAttribute(description="Number of suspected events received")
protected int num_suspect_events;
/* --------------------------------------------- Fields ------------------------------------------------------ */
// Map of addresses and timestamps of last updates (ns); the local address is never added (see update())
protected final ConcurrentMap<Address, Long> timestamps=Util.createConcurrentMap();
// Set when a SET_LOCAL_ADDRESS event passes down the stack
protected Address local_addr;
// Current view members; replaced under synchronized(this) in handleViewChange()
protected final List<Address> members=new ArrayList<>();
// Members currently suspected; modified under synchronized(this)
protected final Set<Address> suspected_mbrs=new HashSet<>();
// Volatile mirror of !suspected_mbrs.isEmpty(), so the message path can check it without taking the monitor
@ManagedAttribute(description="Shows whether there are currently any suspected members")
protected volatile boolean has_suspected_mbrs;
// Both obtained from the transport in init()
protected TimeScheduler timer;
protected TimeService time_service;
// task which multicasts HEARTBEAT message after 'interval' ms
@GuardedBy("lock")
protected Future<?> heartbeat_sender_future;
// task which checks for members exceeding timeout and suspects them
@GuardedBy("lock")
protected Future<?> timeout_checker_future;
// (suspected member, wall-clock time in ms) pairs recorded by suspect(); presumably capped at 20 entries
protected final BoundedList<Tuple<Address,Long>> suspect_history=new BoundedList<>(20);
// Guards heartbeat_sender_future and timeout_checker_future
protected final Lock lock=new ReentrantLock();
// No-arg constructor; all state is initialized by the field initializers above and by init()
public FD_ALL() {}
/**
 * Returns the string form of this member's address.
 *
 * @return {@code local_addr.toString()}, or the literal string "null" while no local address is set
 */
@ManagedAttribute(description="Member address")
public String getLocalAddress() {
    if(local_addr == null)
        return "null";
    return local_addr.toString();
}
/**
 * Lists the current cluster members as a comma-separated string.
 * <p>
 * {@code members} is cleared and refilled under {@code synchronized(this)} on every view change, so a
 * snapshot is taken under the same monitor before formatting. Without it, a management read racing with a
 * view change could iterate the list while it is being modified.
 *
 * @return the members joined with ",", in the order in which they are stored in {@code members}
 */
@ManagedAttribute(description="Lists members of a cluster")
public String getMembers() {
    final List<Address> snapshot;
    synchronized(this) {
        snapshot=new ArrayList<>(members);
    }
    // Format outside the monitor; the snapshot is private to this call
    return Util.printListWithDelimiter(snapshot, ",");
}
/**
 * Returns the string form of the set of currently suspected members.
 * Holds this object's monitor while reading, as the set is only modified under that monitor.
 *
 * @return {@code suspected_mbrs.toString()}
 */
@ManagedAttribute(description="Currently suspected members")
public synchronized String getSuspectedMembers() {
    return String.valueOf(suspected_mbrs);
}
/** @return the number of heartbeats sent so far */
public int getHeartbeatsSent() {
    return num_heartbeats_sent;
}

/** @return the number of heartbeats received so far */
public int getHeartbeatsReceived() {
    return num_heartbeats_received;
}

/** @return the current value of the {@code num_suspect_events} counter */
public int getSuspectEventsSent() {
    return num_suspect_events;
}

/** @return the suspicion timeout, in ms */
public long getTimeout() {
    return timeout;
}

/** @param timeout the new suspicion timeout, in ms */
public void setTimeout(long timeout) {
    this.timeout=timeout;
}

/** @return the interval (ms) at which timeouts are checked */
public long getTimeoutCheckInterval() {
    return timeout_check_interval;
}

/** @param timeout_check_interval the new interval (ms) at which timeouts are checked */
public void setTimeoutCheckInterval(long timeout_check_interval) {
    this.timeout_check_interval=timeout_check_interval;
}

/** @return the interval (ms) at which heartbeats are sent */
public long getInterval() {
    return interval;
}

/** @param interval the new interval (ms) at which heartbeats are sent */
public void setInterval(long interval) {
    this.interval=interval;
}
/**
 * Tells whether both periodic tasks (timeout checker and heartbeat sender) are currently scheduled.
 * The check is made while holding {@code lock}, which guards both task futures.
 *
 * @return true only if the timeout checker and the heartbeat sender are both running
 */
@ManagedAttribute(description="Are heartbeat tasks running")
public boolean isRunning() {
    lock.lock();
    try {
        if(!isTimeoutCheckerRunning())
            return false;
        return isHeartbeatSenderRunning();
    }
    finally {
        lock.unlock();
    }
}
/**
 * Renders the recorded suspect history, one entry per line, as {@code <date>: <member>}.
 *
 * @return the formatted history; empty if nothing was recorded
 */
@ManagedOperation(description="Prints suspect history")
public String printSuspectHistory() {
    final StringBuilder out=new StringBuilder();
    for(Tuple<Address,Long> record: suspect_history) {
        // val2 is the wall-clock time (ms) stored when the member was suspected
        final Date when=new Date(record.getVal2());
        out.append(when).append(": ").append(record.getVal1()).append("\n");
    }
    return out.toString();
}
/**
 * Management entry point that delegates to {@code _printTimestamps()}.
 *
 * @return one line per tracked member with the age of its last timestamp
 */
@ManagedOperation(description="Prints timestamps")
public String printTimestamps() {
    final String dump=_printTimestamps();
    return dump;
}
/**
 * Cancels the periodic timeout check. The heartbeat sender is not touched by this operation.
 */
@ManagedOperation(description="Stops checking for crashed members")
public void stopFailureDetection() {
    stopTimeoutChecker();
}
/**
 * (Re)schedules the periodic timeout check; does nothing if it is already running.
 */
@ManagedOperation(description="Resumes checking for crashed members")
public void startFailureDetection() {
    startTimeoutChecker();
}
/**
 * Zeroes the three statistics counters and empties the suspect history.
 */
public void resetStats() {
    num_suspect_events=0;
    num_heartbeats_received=0;
    num_heartbeats_sent=0;
    suspect_history.clear();
}
/**
 * Fetches the timer and time service from the transport and resets the suspect state.
 * <p>
 * If no time service is available, or its interval is larger than {@code timeout},
 * {@code getTimestamp()} ends up using {@code System.nanoTime()}. The warning logged for the missing
 * time service names that clock explicitly (it previously named {@code System.currentTimeMillis()},
 * which is not what the fallback uses).
 *
 * @throws Exception if the transport does not provide a timer
 */
public void init() throws Exception {
    timer=getTransport().getTimer();
    if(timer == null)
        throw new Exception("timer not set");

    time_service=getTransport().getTimeService();
    if(time_service == null) {
        // use_time_service is left as-is; getTimestamp() checks time_service for null itself
        log.warn("%s: time service is not available, using System.nanoTime() instead", local_addr);
    }
    else if(time_service.interval() > timeout) {
        // Both values are compared as-is; a time service coarser than the timeout cannot be used to detect it
        log.warn("%s: interval of time service (%d) is greater than timeout (%d), disabling time service",
                 local_addr, time_service.interval(), timeout);
        use_time_service=false;
    }

    suspected_mbrs.clear();
    has_suspected_mbrs=false;
}
/**
 * Cancels both periodic tasks and forgets all suspected members.
 * Runs while holding this object's monitor, which also guards {@code suspected_mbrs}.
 */
public synchronized void stop() {
    // Tasks first, so that no checker run is scheduled after the suspect state has been cleared
    stopHeartbeatSender();
    stopTimeoutChecker();

    suspected_mbrs.clear();
    has_suspected_mbrs=false;
}
/**
 * Handles a single message travelling up the stack.
 * <p>
 * A message carrying a header under this protocol's id is a heartbeat: the sender's timestamp is refreshed,
 * the received counter is incremented, the sender is unsuspected and the message is consumed. Any other
 * message is passed up; if {@code msg_counts_as_heartbeat} is set, it also refreshes the sender's timestamp
 * first (and unsuspects the sender when there are suspected members).
 *
 * @param msg the received message
 * @return null for a consumed heartbeat, otherwise whatever the protocol above returns
 */
public Object up(Message msg) {
    final Address src=msg.getSrc();
    final boolean is_heartbeat=msg.getHeader(this.id) != null;

    if(is_heartbeat) {
        update(src); // updates the heartbeat entry for 'src'
        num_heartbeats_received++;
        unsuspect(src);
        return null; // heartbeats stop here and are not seen by the layer above
    }

    if(msg_counts_as_heartbeat) {
        // regular traffic from FD_ALL's point of view, but it still proves the sender is alive
        update(src);
        if(has_suspected_mbrs)
            unsuspect(src);
    }
    return up_prot.up(msg); // hand over to the layer above us
}
/**
 * Handles a message batch travelling up the stack.
 * <p>
 * Heartbeat messages (those matching this protocol's id) are extracted from the batch. If at least one was
 * found, or if {@code msg_counts_as_heartbeat} is set, the batch sender's timestamp is refreshed and the
 * sender is unsuspected when there are suspected members. Whatever is left in the batch is passed up.
 * <p>
 * {@code num_heartbeats_received} grows by the number of heartbeat messages actually found in the batch.
 * It previously grew by exactly one per batch, which under-counted batches with several heartbeats and
 * counted data-only batches as a heartbeat when {@code msg_counts_as_heartbeat} was set, unlike
 * {@code up(Message)}.
 *
 * @param batch the received batch
 */
public void up(MessageBatch batch) {
    // NOTE(review): the 'true' argument presumably removes the matching messages from the batch - confirm
    final Collection<Message> heartbeats=batch.getMatchingMessages(id, true);
    final int num_heartbeats=heartbeats != null? heartbeats.size() : 0;

    if(num_heartbeats > 0 || msg_counts_as_heartbeat) {
        final Address sender=batch.sender();
        update(sender);
        num_heartbeats_received+=num_heartbeats; // only real heartbeats are counted
        if(has_suspected_mbrs)
            unsuspect(sender);
    }

    if(!batch.isEmpty())
        up_prot.up(batch);
}
/**
 * Handles events travelling down the stack.
 * <ul>
 * <li>VIEW_CHANGE: the event is forwarded down first, then the new view is applied locally; returns null.</li>
 * <li>SET_LOCAL_ADDRESS: remembers the local address, then forwards the event.</li>
 * <li>UNSUSPECT: unsuspects the member and refreshes its timestamp, then forwards the event.</li>
 * <li>Anything else is forwarded unchanged.</li>
 * </ul>
 *
 * @param evt the event
 * @return null for VIEW_CHANGE, otherwise the result of the protocol below
 */
public Object down(Event evt) {
    final int type=evt.getType();

    if(type == Event.VIEW_CHANGE) {
        down_prot.down(evt);
        final View view=evt.getArg();
        handleViewChange(view);
        return null;
    }

    if(type == Event.SET_LOCAL_ADDRESS) {
        local_addr=evt.getArg();
    }
    else if(type == Event.UNSUSPECT) {
        final Address member=evt.getArg();
        unsuspect(member);
        update(member);
    }
    return down_prot.down(evt);
}
/**
 * Schedules the {@code TimeoutChecker} task with a fixed delay of {@code timeout_check_interval} ms,
 * unless it is already running. Done under {@code lock}, which guards the task's future.
 */
protected void startTimeoutChecker() {
    lock.lock();
    try {
        if(isTimeoutCheckerRunning())
            return;
        final TimeoutChecker task=new TimeoutChecker();
        timeout_checker_future=timer.scheduleWithFixedDelay(task, timeout_check_interval, timeout_check_interval,
                                                            TimeUnit.MILLISECONDS, false);
    }
    finally {
        lock.unlock();
    }
}
/**
 * Cancels the timeout checker task (interrupting it if running) and clears its future.
 * Does nothing if the task was never started. Done under {@code lock}.
 */
protected void stopTimeoutChecker() {
    lock.lock();
    try {
        final Future<?> running=timeout_checker_future;
        if(running == null)
            return;
        running.cancel(true);
        timeout_checker_future=null;
    }
    finally {
        lock.unlock();
    }
}
/**
 * Schedules the {@code HeartbeatSender} task with a fixed delay of {@code interval} ms, unless it is
 * already running. Done under {@code lock}, which guards the task's future.
 */
protected void startHeartbeatSender() {
    lock.lock();
    try {
        if(isHeartbeatSenderRunning())
            return;
        final HeartbeatSender task=new HeartbeatSender();
        // NOTE(review): the last argument is true only on a TCP transport; presumably it tells the timer
        // that the task may block - confirm against TimeScheduler
        final boolean over_tcp=getTransport() instanceof TCP;
        heartbeat_sender_future=timer.scheduleWithFixedDelay(task, interval, interval, TimeUnit.MILLISECONDS, over_tcp);
    }
    finally {
        lock.unlock();
    }
}
/**
 * Cancels the heartbeat sender task (interrupting it if running) and clears its future.
 * Does nothing if the task was never started. Done under {@code lock}.
 */
protected void stopHeartbeatSender() {
    lock.lock();
    try {
        final Future<?> running=heartbeat_sender_future;
        if(running == null)
            return;
        running.cancel(true);
        heartbeat_sender_future=null;
    }
    finally {
        lock.unlock();
    }
}
/**
 * @return true if the timeout checker has been scheduled and its future is not done yet
 */
protected boolean isTimeoutCheckerRunning() {
    if(timeout_checker_future == null)
        return false;
    return !timeout_checker_future.isDone();
}
/**
 * @return true if the heartbeat sender has been scheduled and its future is not done yet
 */
protected boolean isHeartbeatSenderRunning() {
    if(heartbeat_sender_future == null)
        return false;
    return !heartbeat_sender_future.isDone();
}
/**
 * Sets the timestamp of {@code sender} to the current time, overwriting any previous value.
 * Null senders and the local address are ignored.
 *
 * @param sender the member from which a heartbeat (or data) was received
 */
protected void update(Address sender) {
    if(sender == null || sender.equals(local_addr))
        return;
    timestamps.put(sender, getTimestamp());
}
/**
 * Gives {@code mbr} an initial timestamp (the current time) if it has none yet; an existing timestamp is
 * left alone. Null members and the local address are ignored.
 *
 * @param mbr the member to start tracking
 */
protected void addIfAbsent(Address mbr) {
    if(mbr == null || mbr.equals(local_addr))
        return;
    timestamps.putIfAbsent(mbr, getTimestamp());
}
/**
 * Returns the current time used for all heartbeat bookkeeping.
 *
 * @return the time service's timestamp if the time service is enabled and available, otherwise
 *         {@code System.nanoTime()}
 */
protected long getTimestamp() {
    if(use_time_service && time_service != null)
        return time_service.timestamp();
    return System.nanoTime();
}
/**
 * Applies a new view: replaces the member list, drops suspects and timestamps of members that left,
 * starts tracking new members, and starts or stops the periodic tasks depending on the view size.
 *
 * @param v the new view
 */
protected void handleViewChange(View v) {
    final List<Address> new_members=v.getMembers();

    synchronized(this) {
        members.clear();
        members.addAll(new_members);

        // retainAll() reports whether anything was removed; only then can the flag have changed
        final boolean suspects_removed=suspected_mbrs.retainAll(new_members);
        if(suspects_removed)
            has_suspected_mbrs=!suspected_mbrs.isEmpty();

        timestamps.keySet().retainAll(new_members);
    }

    // Existing members keep their timestamps, new ones start at the current time
    for(Address mbr: new_members)
        addIfAbsent(mbr);

    // With a single member (or none) there is nobody to exchange heartbeats with
    if(new_members.size() <= 1) {
        stopHeartbeatSender();
        stopTimeoutChecker();
    }
    else {
        startHeartbeatSender();
        startTimeoutChecker();
    }
}
/**
 * Renders every tracked member together with the age of its last timestamp, one per line, as
 * {@code <member>: <n> secs old}.
 *
 * @return the formatted listing; empty if no member is tracked
 */
protected String _printTimestamps() {
    final StringBuilder out=new StringBuilder();
    final long now=getTimestamp();
    for(Entry<Address,Long> entry: timestamps.entrySet()) {
        out.append(entry.getKey()).append(": ");
        // timestamps are in nanoseconds; report whole seconds
        final long age_ns=now - entry.getValue();
        out.append(TimeUnit.NANOSECONDS.toSeconds(age_ns)).append(" secs old\n");
    }
    return out.toString();
}
/**
 * Records the given members as suspected and, if this member is the one responsible, emits SUSPECT events.
 * <p>
 * Every suspect is added to the suspect history (with the wall-clock time) and to {@code suspected_mbrs}.
 * SUSPECT events are only sent (up and down the stack, once per suspect) when the local address equals the
 * first member of the current view that is not itself suspected.
 *
 * @param suspects the members that timed out; null or empty lists are ignored
 */
protected void suspect(List<Address> suspects) {
    if(suspects == null || suspects.isEmpty())
        return;

    num_suspect_events+=suspects.size();

    final List<Address> candidates;
    synchronized(this) {
        for(Address suspect: suspects) {
            suspect_history.add(new Tuple<>(suspect, System.currentTimeMillis())); // need wall clock time
            suspected_mbrs.add(suspect);
        }
        // view members minus all suspected ones, in view order
        candidates=new ArrayList<>(members);
        candidates.removeAll(suspected_mbrs);
        has_suspected_mbrs=!suspected_mbrs.isEmpty();
    }

    // Only the first non-suspected member sends the events up the stack
    if(local_addr == null || candidates.isEmpty())
        return;
    if(!local_addr.equals(candidates.get(0)))
        return;

    log.debug("suspecting " + getSuspectedMembers());
    for(Address suspect: suspects) {
        up_prot.up(new Event(Event.SUSPECT, suspect));
        down_prot.down(new Event(Event.SUSPECT, suspect));
    }
}
/**
 * Takes {@code mbr} off the suspected list and, if it was on it, sends an UNSUSPECT event both up and
 * down the stack. The events are sent after the monitor has been released.
 *
 * @param mbr the member to unsuspect; null is ignored
 * @return true if the member was removed from suspected_mbrs, otherwise false
 */
protected boolean unsuspect(Address mbr) {
    if(mbr == null)
        return false;

    final boolean was_suspected;
    synchronized(this) {
        was_suspected=!suspected_mbrs.isEmpty() && suspected_mbrs.remove(mbr);
        if(was_suspected)
            has_suspected_mbrs=!suspected_mbrs.isEmpty();
    }

    if(!was_suspected)
        return false;

    up_prot.up(new Event(Event.UNSUSPECT, mbr));
    down_prot.down(new Event(Event.UNSUSPECT, mbr));
    return true;
}
/**
 * Header that marks a message as a heartbeat. It has no fields: its serialized size is 0 and both
 * {@code writeTo} and {@code readFrom} do nothing.
 */
public static class HeartbeatHeader extends Header {

    public HeartbeatHeader() {
    }

    public String toString() {
        return "heartbeat";
    }

    public short getMagicId() {
        return 62;
    }

    public Supplier<? extends Header> create() {
        return () -> new HeartbeatHeader();
    }

    public int serializedSize() {
        return 0;
    }

    public void writeTo(DataOutput out) throws Exception {
        // nothing to write
    }

    public void readFrom(DataInput in) throws Exception {
        // nothing to read
    }
}
/**
 * Periodic task that sends one heartbeat: a message flagged INTERNAL which carries a
 * {@code HeartbeatHeader} under this protocol's id and is passed down the stack.
 */
class HeartbeatSender implements Runnable {

    public void run() {
        final Message heartbeat=new Message()
          .setFlag(Message.Flag.INTERNAL)
          .putHeader(id, new HeartbeatHeader());
        down_prot.down(heartbeat);
        num_heartbeats_sent++;
    }

    public String toString() {
        final String owner=FD_ALL.class.getSimpleName();
        final String task=getClass().getSimpleName();
        return owner + ": " + task;
    }
}
/**
 * Periodic task that walks over all tracked members and suspects those whose last timestamp is older
 * than {@code timeout} ms. Entries with a null timestamp are removed from the table.
 */
class TimeoutChecker implements Runnable {

    public void run() {
        final List<Address> expired=new LinkedList<>();
        final long now=getTimestamp();

        final Iterator<Entry<Address,Long>> it=timestamps.entrySet().iterator();
        while(it.hasNext()) {
            final Entry<Address,Long> entry=it.next();
            final Address member=entry.getKey();
            final Long last_seen=entry.getValue();
            if(last_seen == null) {
                it.remove();
                continue;
            }

            // timestamps are kept in nanoseconds, the timeout is in milliseconds
            final long age_ms=TimeUnit.NANOSECONDS.toMillis(now - last_seen);
            if(age_ms > timeout) {
                log.debug("haven't received a heartbeat from " + member + " for " + age_ms +
                            " ms, adding it to suspect list");
                expired.add(member);
            }
        }

        if(!expired.isEmpty())
            suspect(expired);
    }

    public String toString() {
        final String owner=FD_ALL.class.getSimpleName();
        final String task=getClass().getSimpleName();
        return owner + ": " + task + " (interval=" + timeout_check_interval + " ms)";
    }
}
}