-
Notifications
You must be signed in to change notification settings - Fork 470
/
SEQUENCER.java
663 lines (553 loc) · 22.7 KB
/
SEQUENCER.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
package org.jgroups.protocols;
import org.jgroups.*;
import org.jgroups.annotations.MBean;
import org.jgroups.annotations.ManagedAttribute;
import org.jgroups.annotations.ManagedOperation;
import org.jgroups.annotations.Property;
import org.jgroups.stack.Protocol;
import org.jgroups.util.*;
import java.io.DataInput;
import java.io.DataOutput;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Objects;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.function.Supplier;
/**
* Implementation of total order protocol using a sequencer.
* Consult <a href="https://github.com/belaban/JGroups/blob/master/doc/design/SEQUENCER.txt">SEQUENCER.txt</a> for details
* @author Bela Ban
*/
@MBean(description="Implementation of total order protocol using a sequencer")
public class SEQUENCER extends Protocol {
/** Address of this member; assigned when a SET_LOCAL_ADDRESS event passes through down(Event) */
protected Address local_addr;
/** Current coordinator (the first member of the view); assigned in flush(), null until then */
protected volatile Address coord;
/** Most recent view accepted by handleViewChange() */
protected volatile View view;
/** True if local_addr equals coord; recomputed in flush() */
@ManagedAttribute
protected volatile boolean is_coord;
/** Generates the seqnos placed in the headers of messages sent via down(Message) */
protected final AtomicLong seqno=new AtomicLong(0);
/** Maintains messages forwarded to the coord for which no ack has been received yet.
* Needs to be sorted so we resend them in the right order
*/
protected final NavigableMap<Long,Message> forward_table=new ConcurrentSkipListMap<>();
/** Held by flush() while resending and by forwardToCoord() while sending in ack-mode */
protected final Lock send_lock=new ReentrantLock();
/** Senders wait on this condition in block() until flushing has completed */
protected final Condition send_cond=send_lock.newCondition();
/** When ack_mode is set, we need to wait for an ack for each forwarded message until we can send the next one */
protected volatile boolean ack_mode=true;
/** Set when we block all sending threads to resend all messages from forward_table */
protected volatile boolean flushing=false;
/** Set to true in start() and to false in stop() */
protected volatile boolean running=true;
/** Keeps track of the threads sending messages */
protected final AtomicInteger in_flight_sends=new AtomicInteger(0);
// Maintains received seqnos, so we can weed out dupes
protected final ConcurrentMap<Address,BoundedHashMap<Long,Long>> delivery_table=Util.createConcurrentMap();
/** Background thread running flush(); created in startFlusher() */
protected volatile Flusher flusher;
/** Used for each resent message to wait until the message has been received */
protected final Promise<Long> ack_promise=new Promise<>();
@Property(description="Size of the set to store received seqnos (for duplicate checking)")
protected int delivery_table_max_size=2000;
@Property(description="Number of acks needed before going from ack-mode to normal mode. " +
"0 disables this, which means that ack-mode is always on")
protected int threshold=10;
/** Acks counted in deliver() while in ack-mode; reset to 0 when ack-mode is left and at the end of flush() */
@ManagedAttribute protected int num_acks;
// Message counters; all five are cleared by resetStats()
@ManagedAttribute protected long forwarded_msgs;
@ManagedAttribute protected long bcast_msgs;
@ManagedAttribute protected long received_forwards;
@ManagedAttribute protected long received_bcasts;
@ManagedAttribute protected long delivered_bcasts;
/** @return true if this member is currently the coordinator */
@ManagedAttribute
public boolean isCoordinator() {
    return is_coord;
}

/** @return the address of the current coordinator, or null if none has been set yet */
public Address getCoordinator() {
    return coord;
}

/** @return the address of this member */
public Address getLocalAddress() {
    return local_addr;
}

/** @return the number of forwarded messages still waiting in the forward-table */
@ManagedAttribute(description="Number of messages in the forward-table")
public int getForwardTableSize() {
    return forward_table.size();
}

/** @param new_threshold number of acks needed before leaving ack-mode (0 keeps ack-mode always on) */
public void setThreshold(int new_threshold) {
    threshold=new_threshold;
}

/** @param size capacity used for the per-sender seqno sets created in canDeliver() */
public void setDeliveryTableMaxSize(int size) {
    this.delivery_table_max_size=size;
}
/** Sets the five message counters (forwarded, bcast, received forwards/bcasts, delivered bcasts) back to 0. */
@ManagedOperation
public void resetStats() {
    delivered_bcasts=0L;
    received_bcasts=0L;
    received_forwards=0L;
    bcast_msgs=0L;
    forwarded_msgs=0L;
}
/**
 * Starts the protocol: after the superclass has started, the protocol is marked as running and
 * put into ack-mode.
 * @throws Exception if the superclass fails to start
 */
public void start() throws Exception {
    super.start();
    this.running=true;
    this.ack_mode=true;
}
/**
 * Stops the protocol: clears the running flag, releases all blocked sender threads, terminates the
 * flusher thread and finally stops the superclass.
 */
public void stop() {
    this.running=false;
    unblockAll();   // wake up threads parked in block()
    stopFlusher();  // interrupt and join a flusher thread that may still be alive
    super.stop();
}
/**
 * Intercepts view changes, temporary views and the local address on their way down; the event is
 * always passed on to the protocol below.
 * @param evt the event travelling down the stack
 * @return whatever the protocol below returns for the event
 */
public Object down(Event evt) {
    final int type=evt.getType();
    if(type == Event.VIEW_CHANGE)
        handleViewChange(evt.getArg());
    else if(type == Event.TMP_VIEW)
        handleTmpView(evt.getArg());
    else if(type == Event.SET_LOCAL_ADDRESS)
        local_addr=evt.getArg();
    return down_prot.down(evt);
}
/**
 * Sends a message. Unicasts and messages flagged NO_TOTAL_ORDER or OOB bypass this protocol and are
 * passed down unchanged. All other messages get a SequencerHeader with a fresh seqno and are handed
 * to forwardToCoord(); they are never passed down directly.
 * @param msg the message to send
 * @return the result of the protocol below for bypassed messages, otherwise null
 */
public Object down(Message msg) {
    if(msg.getDest() != null || msg.isFlagSet(Message.Flag.NO_TOTAL_ORDER) || msg.isFlagSet(Message.Flag.OOB))
        return down_prot.down(msg);
    if(msg.getSrc() == null)
        msg.setSrc(local_addr);
    if(flushing)
        block(); // wait until the flusher has finished resending forward_table
    // A seqno is not used to establish ordering, but only to weed out duplicates; next_seqno doesn't need
    // to increase monotonically, but only to be unique (https://issues.jboss.org/browse/JGRP-1461) !
    long next_seqno=seqno.incrementAndGet();
    in_flight_sends.incrementAndGet(); // flush() waits until this counter drops back to 0
    try {
        SequencerHeader hdr=new SequencerHeader(is_coord? SequencerHeader.BCAST : SequencerHeader.WRAPPED_BCAST, next_seqno);
        msg.putHeader(this.id, hdr);
        // log the seqno assigned to *this* message; the shared counter may already have been advanced
        // by another sender thread
        if(log.isTraceEnabled())
            log.trace("[" + local_addr + "]: forwarding " + local_addr + "::" + next_seqno + " to coord " + coord);
        // We always forward messages to the coordinator, even if we're the coordinator. Having the coord
        // send its messages directly led to starvation of messages from other members. MPerf perf went up
        // from 20MB/sec/node to 50MB/sec/node with this change !
        forwardToCoord(next_seqno, msg);
    }
    catch(Exception ex) {
        log.error(Util.getMessage("FailedSendingMessage"), ex);
    }
    finally {
        in_flight_sends.decrementAndGet();
    }
    return null; // don't pass down
}
/**
 * Handles events travelling up. A VIEW_CHANGE is first passed up and only then processed locally;
 * a TMP_VIEW is processed locally first and then passed up like every other event.
 * @param evt the event travelling up the stack
 * @return whatever the protocol above returns for the event
 */
public Object up(Event evt) {
    final int type=evt.getType();
    if(type == Event.VIEW_CHANGE) {
        Object result=up_prot.up(evt);
        handleViewChange(evt.getArg());
        return result;
    }
    if(type == Event.TMP_VIEW)
        handleTmpView(evt.getArg());
    return up_prot.up(evt);
}
/**
 * Handles a received message. Messages flagged NO_TOTAL_ORDER or OOB, and messages without a
 * SequencerHeader, are passed up unchanged. FORWARD/FLUSH requests are broadcast if we are the
 * coordinator and the sender is a member of the current view; BCAST and WRAPPED_BCAST messages
 * are delivered.
 * @param msg the received message
 * @return the result of the protocol above for messages passed up unchanged, otherwise null
 */
public Object up(Message msg) {
    if(msg.isFlagSet(Message.Flag.NO_TOTAL_ORDER) || msg.isFlagSet(Message.Flag.OOB))
        return up_prot.up(msg);
    final SequencerHeader hdr=msg.getHeader(this.id);
    if(hdr == null)
        return up_prot.up(msg); // pass up

    final byte type=hdr.type;
    if(type == SequencerHeader.FORWARD || type == SequencerHeader.FLUSH) {
        // only the coordinator may turn a forwarded message into a broadcast
        if(!is_coord) {
            if(log.isErrorEnabled())
                log.error(local_addr + ": non-coord; dropping FORWARD request from " + msg.getSrc());
            return null;
        }
        final Address sender=msg.getSrc();
        if(view != null && !view.containsMember(sender)) {
            if(log.isErrorEnabled())
                log.error(local_addr + ": dropping FORWARD request from non-member " + sender +
                            "; view=" + view);
            return null;
        }
        final boolean is_flush=type == SequencerHeader.FLUSH;
        broadcast(msg, true, msg.getSrc(), hdr.seqno, is_flush); // do copy the message
        received_forwards++;
    }
    else if(type == SequencerHeader.BCAST) {
        deliver(msg, hdr);
        received_bcasts++;
    }
    else if(type == SequencerHeader.WRAPPED_BCAST) {
        unwrapAndDeliver(msg, hdr.flush_ack); // unwrap the original message (in the payload) and deliver it
        received_bcasts++;
    }
    return null;
}
/**
 * Handles a batch of received messages: every message which carries a SequencerHeader and is not
 * flagged NO_TOTAL_ORDER or OOB is removed from the batch and processed by up(Message); what is
 * left of the batch (if anything) is passed up.
 * @param batch the received message batch
 */
public void up(MessageBatch batch) {
    for(Message msg: batch) {
        final boolean ours=!msg.isFlagSet(Message.Flag.NO_TOTAL_ORDER)
          && !msg.isFlagSet(Message.Flag.OOB)
          && msg.getHeader(id) != null;
        if(ours) {
            batch.remove(msg);
            // simplistic implementation
            try {
                up(msg);
            }
            catch(Throwable t) {
                log.error(Util.getMessage("FailedPassingUpMessage"), t);
            }
        }
    }
    if(batch.isEmpty())
        return;
    up_prot.up(batch);
}
/* --------------------------------- Private Methods ----------------------------------- */
/**
 * Installs a view if it is newer than the current one, prunes delivery_table to the members of the
 * new view and, if the coordinator (first member) differs from the current one, restarts the flusher
 * for the new coordinator.
 * @param v the new view; ignored if it has no members or is not newer than the current view
 */
protected void handleViewChange(View v) {
    final List<Address> members=v.getMembers();
    if(members.isEmpty())
        return;
    // ignore views which are not newer than the one we already have
    if(view != null && view.compareTo(v) >= 0)
        return;
    view=v;

    delivery_table.keySet().retainAll(members);

    final Address old_coord=coord;
    final Address new_coord=members.get(0);
    if(new_coord != null && !Objects.equals(old_coord, new_coord)) {
        stopFlusher();
        startFlusher(new_coord); // needs to be done in the background, to prevent blocking if down() would block
    }
}
/**
 * Switches to a new coordinator and resends all messages still in forward_table. Invoked by the
 * Flusher thread: waits until no sender thread is in flight, then - holding send_lock - updates
 * coord and is_coord and calls flushMessagesInForwardTable(). On exit, flushing is cleared, ack-mode
 * is re-enabled and threads waiting on send_cond are woken up.
 * @param new_coord the address of the new coordinator
 * @throws InterruptedException if the thread is interrupted while sleeping or while acquiring send_lock
 */
protected void flush(final Address new_coord) throws InterruptedException {
// wait until all threads currently sending messages have returned (new threads after flushing=true) will block
// flushing is set to true in startFlusher()
while(flushing && running) {
if(in_flight_sends.get() == 0)
break;
Thread.sleep(100);
}
// acquired outside the try block: if this throws, the finally clause below (which unlocks) is not run
send_lock.lockInterruptibly();
try {
if(log.isTraceEnabled())
log.trace(local_addr + ": coord changed from " + coord + " to " + new_coord);
coord=new_coord;
is_coord=Objects.equals(local_addr, coord);
flushMessagesInForwardTable();
}
finally {
if(log.isTraceEnabled())
log.trace(local_addr + ": flushing completed");
flushing=false;
ack_mode=true; // go to ack-mode after flushing
num_acks=0;
// wake up the sender threads waiting in block()
send_cond.signalAll();
send_lock.unlock();
}
}
/**
 * Treats a TMP_VIEW as an immediate view change, but only if this member is about to become
 * coordinator, i.e. the first member of the view is the local address and differs from the
 * current coordinator. See JGRP-1452.
 * @param v the temporary view; ignored if it has no members
 */
private void handleTmpView(View v) {
    final List<Address> members=v.getMembers();
    if(members.isEmpty())
        return;
    final Address first=members.get(0);
    final boolean coord_differs=!first.equals(coord);
    final boolean we_are_first=local_addr != null && local_addr.equals(first);
    if(coord_differs && we_are_first)
        handleViewChange(v);
}
/**
 * Sends all messages currently in forward_table to the new coordinator (changing the dest field).
 * This needs to be done, so the underlying reliable unicast protocol (e.g. UNICAST) adds these messages
 * to its retransmission mechanism<br/>
 * Note that we need to resend the messages in order of their seqnos ! We also need to prevent other message
 * from being inserted until we're done, that's why there's synchronization.<br/>
 * There are no insertions into forward_table during flushing (all down-threads are blocked), but
 * entries can still be removed concurrently by deliver(), so the table may become empty at any time.<br/>
 * If we are the coordinator, every entry is broadcast once as WRAPPED_BCAST. Otherwise each entry is
 * sent to the coordinator as FLUSH and resent until it has been acked or has disappeared from the table.
 * A message which cannot be serialized is logged and dropped from forward_table, as it could never be
 * resent and would otherwise be retried forever.
 */
protected void flushMessagesInForwardTable() {
    if(is_coord) {
        for(Map.Entry<Long,Message> entry: forward_table.entrySet()) {
            Long key=entry.getKey();
            Message msg=entry.getValue();
            Buffer buf;
            try {
                buf=Util.streamableToBuffer(msg);
            }
            catch(Exception e) {
                log.error(Util.getMessage("FlushingBroadcastingFailed"), e);
                continue;
            }

            SequencerHeader hdr=new SequencerHeader(SequencerHeader.WRAPPED_BCAST, key);
            Message forward_msg=new Message(null, buf).putHeader(this.id, hdr);
            if(log.isTraceEnabled())
                log.trace(local_addr + ": flushing (broadcasting) " + local_addr + "::" + key);
            down_prot.down(forward_msg);
        }
        return;
    }

    // for forwarded messages, we need to receive the forwarded message from the coordinator, to prevent this case:
    // - V1={A,B,C}
    // - A crashes
    // - C installs V2={B,C}
    // - C forwards messages 3 and 4 to B (the new coord)
    // - B drops 3 because its view is still V1
    // - B installs V2
    // - B receives message 4 and broadcasts it
    // ==> C's message 4 is delivered *before* message 3 !
    // ==> By resending 3 until it is received, then resending 4 until it is received, we make sure this won't happen
    // (see https://issues.jboss.org/browse/JGRP-1449)
    while(flushing && running && !forward_table.isEmpty()) {
        Map.Entry<Long,Message> entry=forward_table.firstEntry();
        if(entry == null) // the last entry was removed by deliver() after the isEmpty() check
            break;
        final Long key=entry.getKey();
        Message msg=entry.getValue();
        Buffer buf;

        try {
            buf=Util.streamableToBuffer(msg);
        }
        catch(Exception e) {
            log.error(Util.getMessage("FlushingBroadcastingFailed"), e);
            // the entry would be picked again as first entry on the next iteration and fail again:
            // drop it, or this loop never terminates while send_lock is held
            forward_table.remove(key);
            continue;
        }

        // resend the same message until it has been acked or removed from the table
        while(flushing && running && !forward_table.isEmpty()) {
            SequencerHeader hdr=new SequencerHeader(SequencerHeader.FLUSH, key);
            Message forward_msg=new Message(coord, buf).putHeader(this.id,hdr).setFlag(Message.Flag.DONT_BUNDLE);
            if(log.isTraceEnabled())
                log.trace(local_addr + ": flushing (forwarding) " + local_addr + "::" + key + " to coord " + coord);
            ack_promise.reset();
            down_prot.down(forward_msg);
            Long ack=ack_promise.getResult(500);
            if((Objects.equals(ack, key)) || !forward_table.containsKey(key))
                break;
        }
    }
}
/**
 * Hands a message to the coordinator, choosing the strategy from the current state:
 * the coordinator forwards right away; while stopped or flushing the message is only stored in
 * forward_table; in normal mode it is stored and forwarded once; in ack-mode it is stored and
 * resent (as FLUSH) under send_lock until it has been acked or removed from forward_table.
 * @param seqno the seqno assigned to the message
 * @param msg the message to be forwarded
 */
protected void forwardToCoord(long seqno, Message msg) {
// coordinator: forward immediately, without an entry in forward_table
if(is_coord) {
forward(msg, seqno, false);
return;
}
// not running or a flush is in progress: only store the message; flushMessagesInForwardTable() resends
// the entries of forward_table
if(!running || flushing) {
forward_table.put(seqno, msg);
return;
}
// normal mode: store, forward once and don't wait for an ack
if(!ack_mode) {
forward_table.put(seqno, msg);
forward(msg, seqno, false);
return;
}
// ack-mode: one sender at a time
send_lock.lock();
try {
forward_table.put(seqno, msg);
while(running && !flushing) {
ack_promise.reset();
forward(msg, seqno, true);
if(!ack_mode || !running || flushing)
break;
// wait for the ack with a timeout of 500 (presumably milliseconds - confirm against Promise.getResult())
Long ack=ack_promise.getResult(500);
// done if our seqno was acked, or if deliver() has removed the message from forward_table
if((Objects.equals(ack, seqno)) || !forward_table.containsKey(seqno))
break;
}
}
finally {
send_lock.unlock();
}
}
/**
 * Serializes a message and sends it as the payload of a new unicast message to the coordinator.
 * Nothing is sent if no coordinator is known yet. Failures are logged, not thrown.
 * @param msg the message to be forwarded
 * @param seqno the seqno placed into the header of the forwarding message
 * @param flush if true the header type is FLUSH, otherwise FORWARD
 */
protected void forward(final Message msg, long seqno, boolean flush) {
    Address target=coord; // read the volatile field once, so check and use see the same value
    if(target == null)
        return;
    byte type=flush? SequencerHeader.FLUSH : SequencerHeader.FORWARD;
    try {
        SequencerHeader hdr=new SequencerHeader(type, seqno);
        Message forward_msg=new Message(target, Util.streamableToBuffer(msg)).putHeader(this.id,hdr);
        down_prot.down(forward_msg);
        forwarded_msgs++;
    }
    catch(Exception ex) {
        // report the coordinator we tried to reach; the destination of the original message is
        // null (it is a multicast) and carries no information
        log.error(Util.getMessage("FailedForwardingMessageTo") + target, ex);
    }
}
/**
 * Sends a message to all members.
 * @param msg the message to broadcast; if copy is true, its raw buffer becomes the payload of a new message
 * @param copy if false, msg itself is sent; if true, a new message with a WRAPPED_BCAST header is created
 * @param original_sender the original sender of the message (only used for logging)
 * @param seqno the seqno placed into the WRAPPED_BCAST header
 * @param resend if true (and copy is true), the header is marked flush_ack and the message flagged DONT_BUNDLE
 */
protected void broadcast(final Message msg, boolean copy, Address original_sender, long seqno, boolean resend) {
    final Message bcast_msg;
    if(copy) {
        SequencerHeader wrapped_hdr=new SequencerHeader(SequencerHeader.WRAPPED_BCAST, seqno);
        bcast_msg=new Message(null, msg.getRawBuffer(), msg.getOffset(), msg.getLength()).putHeader(this.id, wrapped_hdr);
        if(resend) {
            wrapped_hdr.flush_ack=true;
            bcast_msg.setFlag(Message.Flag.DONT_BUNDLE);
        }
    }
    else
        bcast_msg=msg; // no need to add a header, message already has one

    if(log.isTraceEnabled())
        log.trace(local_addr + ": broadcasting " + original_sender + "::" + seqno);

    down_prot.down(bcast_msg);
    bcast_msgs++;
}
/**
 * Unmarshals the original message from the payload of a WRAPPED_BCAST message and delivers it
 * (unless it has already been delivered). Any exception is logged, not thrown.
 * @param msg the wrapper message whose payload contains the original message
 * @param flush_ack if true, the header of the unwrapped message is marked as flush_ack before delivery
 */
protected void unwrapAndDeliver(final Message msg, boolean flush_ack) {
    try {
        final Message original=Util.streamableFromBuffer(Message.class, msg.getRawBuffer(), msg.getOffset(), msg.getLength());
        final SequencerHeader original_hdr=original.getHeader(this.id);
        if(flush_ack)
            original_hdr.flush_ack=true;
        deliver(original, original_hdr);
    }
    catch(Exception ex) {
        log.error(Util.getMessage("FailureUnmarshallingBuffer"), ex);
    }
}
/**
 * Delivers a broadcast message to the protocol above, unless it has no sender or is a duplicate.
 * If the message was sent by us, it is removed from forward_table; if it is also marked flush_ack,
 * the thread waiting on ack_promise is notified and the ack is counted towards leaving ack-mode.
 * @param msg the message to deliver
 * @param hdr the SequencerHeader of the message
 */
protected void deliver(Message msg, SequencerHeader hdr) {
    final Address sender=msg.getSrc();
    if(sender == null) {
        if(log.isErrorEnabled())
            log.error(local_addr + ": sender is null, cannot deliver ::" + hdr.getSeqno());
        return;
    }

    final long msg_seqno=hdr.getSeqno();
    final boolean own_message=sender.equals(local_addr);
    if(own_message) {
        forward_table.remove(msg_seqno);
        if(hdr.flush_ack) {
            ack_promise.setResult(msg_seqno);
            // switch to normal mode once enough acks have been received
            final boolean counting=ack_mode && !flushing && threshold > 0;
            if(counting && ++num_acks >= threshold) {
                ack_mode=false;
                num_acks=0;
            }
        }
    }

    if(canDeliver(sender, msg_seqno)) {
        if(log.isTraceEnabled())
            log.trace(local_addr + ": delivering " + sender + "::" + msg_seqno);
        up_prot.up(msg);
        delivered_bcasts++;
    }
    else if(log.isWarnEnabled())
        log.warn(local_addr + ": dropped duplicate message " + sender + "::" + msg_seqno);
}
/**
 * Checks if seqno has already been received from sender. This weeds out duplicates.
 * Note that this method is not expected to be called concurrently for the same sender, as the sender in
 * NAKACK will always be the coordinator. Should it happen nevertheless, the set for a sender is
 * registered with putIfAbsent(), so all callers work on the set which is actually stored in
 * delivery_table and no recorded seqnos are lost.
 * @param sender the original sender of the message
 * @param seqno the seqno of the message
 * @return the result of adding seqno to the sender's set: false signals a duplicate to the caller
 */
protected boolean canDeliver(Address sender, long seqno) {
    BoundedHashMap<Long,Long> seqno_set=delivery_table.get(sender);
    if(seqno_set == null) {
        seqno_set=new BoundedHashMap<>(delivery_table_max_size);
        // never replace a set registered in the meantime: it may already contain seqnos
        BoundedHashMap<Long,Long> existing=delivery_table.putIfAbsent(sender, seqno_set);
        if(existing != null)
            seqno_set=existing;
    }
    return seqno_set.add(seqno, seqno);
}
/**
 * Blocks the calling thread on send_cond for as long as both flushing and running are true.
 * The condition is signalled at the end of flush() and in unblockAll().
 */
protected void block() {
send_lock.lock();
try {
while(flushing && running) {
try {
send_cond.await();
}
catch(InterruptedException e) {
// interruption is ignored: the loop condition is re-checked and the wait resumed
}
}
}
finally {
send_lock.unlock();
}
}
/**
 * Clears the flushing flag, wakes up all threads waiting on send_cond (see block()) and sets a null
 * result on ack_promise (presumably releasing a thread waiting in ack_promise.getResult() - confirm
 * against Promise).
 */
protected void unblockAll() {
// cleared before signalling, so that woken threads see flushing == false when re-checking their condition
flushing=false;
send_lock.lock();
try {
send_cond.signalAll();
ack_promise.setResult(null);
}
finally {
send_lock.unlock();
}
}
/**
 * Starts a background thread which flushes forward_table to the new coordinator, unless a flusher
 * thread is still alive. Sets flushing to true before the thread is started.
 * @param new_coord the new coordinator, passed on to the flusher thread
 */
protected synchronized void startFlusher(final Address new_coord) {
    if(flusher != null && flusher.isAlive())
        return; // a flusher is still at work

    if(log.isTraceEnabled())
        log.trace(local_addr + ": flushing started");
    // causes subsequent message sends (broadcasts and forwards) to block (https://issues.jboss.org/browse/JGRP-1495)
    flushing=true;

    final Flusher thread=new Flusher(new_coord);
    flusher=thread;
    thread.setName("Flusher");
    thread.start();
}
/**
 * Clears the flushing flag and terminates the flusher thread (if alive): the thread is interrupted,
 * a null result is set on ack_promise and the caller then waits for the thread to die.
 */
protected void stopFlusher() {
flushing=false;
// work on a snapshot of the volatile field
Thread tmp=flusher;
while(tmp != null && tmp.isAlive()) {
tmp.interrupt();
// presumably releases the flusher if it is waiting in ack_promise.getResult() - confirm against Promise
ack_promise.setResult(null);
try {
tmp.join();
}
catch(InterruptedException e) {
// interruption of the caller is ignored; the loop retries as long as the flusher is alive
}
}
}
/* ----------------------------- End of Private Methods -------------------------------- */
/** Background thread which runs flush() for a given new coordinator. */
protected class Flusher extends Thread {
    protected final Address new_coord;

    /** @param new_coord the coordinator handed to flush() when the thread runs */
    public Flusher(Address new_coord) {
        this.new_coord=new_coord;
    }

    /** Calls flush(new_coord); the thread simply terminates if it gets interrupted. */
    public void run() {
        try {
            flush(new_coord);
        }
        catch(InterruptedException ignored) {
            // interrupted (e.g. by stopFlusher()): nothing left to do
        }
    }
}
/**
 * Header added to messages handled by SEQUENCER. Carries the message type, the seqno assigned by
 * the sender and a flag telling whether the message acks a FLUSH request.
 */
public static class SequencerHeader extends Header {
    protected static final byte FORWARD       = 1;
    protected static final byte FLUSH         = 2;
    protected static final byte BCAST         = 3;
    protected static final byte WRAPPED_BCAST = 4;

    /** One of FORWARD, FLUSH, BCAST or WRAPPED_BCAST; -1 if unset */
    protected byte    type=-1;
    /** Seqno of the message; -1 if unset */
    protected long    seqno=-1;
    /** Set on broadcasts which were triggered by a FLUSH request */
    protected boolean flush_ack;

    /** Creates a header with type and seqno unset; the fields are populated by readFrom(). */
    public SequencerHeader() {
    }

    /** @param type the header type; the seqno stays unset */
    public SequencerHeader(byte type) {
        this.type=type;
    }

    /**
     * @param type the header type
     * @param seqno the seqno of the message
     */
    public SequencerHeader(byte type, long seqno) {
        this.type=type;
        this.seqno=seqno;
    }

    public short getMagicId() {
        return 61;
    }

    public long getSeqno() {
        return seqno;
    }

    public Supplier<? extends Header> create() {
        return SequencerHeader::new;
    }

    /** @return the type name, followed by the seqno (if not negative) and a flush_ack marker (if set) */
    public String toString() {
        final StringBuilder sb=new StringBuilder(64).append(printType());
        if(seqno >= 0)
            sb.append(" seqno=").append(seqno);
        if(flush_ack)
            sb.append(" (flush_ack)");
        return sb.toString();
    }

    /** @return the name of the header type, or "n/a" for an unknown type */
    protected final String printType() {
        if(type == FORWARD)
            return "FORWARD";
        if(type == FLUSH)
            return "FLUSH";
        if(type == BCAST)
            return "BCAST";
        if(type == WRAPPED_BCAST)
            return "WRAPPED_BCAST";
        return "n/a";
    }

    /** Writes type, seqno and flush_ack, in this order. */
    public void writeTo(DataOutput out) throws Exception {
        out.writeByte(type);
        Bits.writeLong(seqno,out);
        out.writeBoolean(flush_ack);
    }

    /** Reads type, seqno and flush_ack, in the order written by writeTo(). */
    public void readFrom(DataInput in) throws Exception {
        type=in.readByte();
        seqno=Bits.readLong(in);
        flush_ack=in.readBoolean();
    }

    public int serializedSize() {
        final int type_size=Global.BYTE_SIZE, flush_ack_size=Global.BYTE_SIZE;
        return type_size + Bits.size(seqno) + flush_ack_size; // type + seqno + flush_ack
    }
}
}