Permalink
Browse files

BOOKKEEPER-208: Separate write quorum from ack quorum (ivank)

git-svn-id: https://svn.apache.org/repos/asf/zookeeper/bookkeeper/trunk@1383872 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information...
1 parent 26671da commit e353a8f93ad48d8785bbaa7162a8a9ebca40a15f @ivankelly ivankelly committed Sep 12, 2012
Showing with 606 additions and 117 deletions.
  1. +2 −0 CHANGES.txt
  2. +14 −7 bookkeeper-benchmark/src/main/java/org/apache/bookkeeper/benchmark/BenchThroughputLatency.java
  3. +73 −7 bookkeeper-server/src/main/java/org/apache/bookkeeper/client/BookKeeper.java
  4. +25 −11 bookkeeper-server/src/main/java/org/apache/bookkeeper/client/DistributionSchedule.java
  5. +2 −3 bookkeeper-server/src/main/java/org/apache/bookkeeper/client/LedgerChecker.java
  6. +5 −2 bookkeeper-server/src/main/java/org/apache/bookkeeper/client/LedgerCreateOp.java
  7. +10 −3 bookkeeper-server/src/main/java/org/apache/bookkeeper/client/LedgerHandle.java
  8. +25 −11 bookkeeper-server/src/main/java/org/apache/bookkeeper/client/LedgerMetadata.java
  9. +17 −32 bookkeeper-server/src/main/java/org/apache/bookkeeper/client/PendingAddOp.java
  10. +3 −3 bookkeeper-server/src/main/java/org/apache/bookkeeper/client/PendingReadOp.java
  11. +30 −15 bookkeeper-server/src/main/java/org/apache/bookkeeper/client/RoundRobinDistributionSchedule.java
  12. +2 −0 bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ClientConfiguration.java
  13. +70 −13 bookkeeper-server/src/main/java/org/apache/bookkeeper/proto/DataFormats.java
  14. +2 −0 bookkeeper-server/src/main/proto/DataFormats.proto
  15. +1 −1 bookkeeper-server/src/test/java/org/apache/bookkeeper/client/BookieRecoveryTest.java
  16. +64 −0 bookkeeper-server/src/test/java/org/apache/bookkeeper/client/RoundRobinDistributionScheduleTest.java
  17. +208 −0 bookkeeper-server/src/test/java/org/apache/bookkeeper/client/SlowBookieTest.java
  18. +2 −2 bookkeeper-server/src/test/java/org/apache/bookkeeper/client/TestLedgerChecker.java
  19. +1 −2 bookkeeper-server/src/test/java/org/apache/bookkeeper/client/TestReadTimeout.java
  20. +1 −1 bookkeeper-server/src/test/java/org/apache/bookkeeper/meta/GcLedgersTest.java
  21. +49 −4 bookkeeper-server/src/test/java/org/apache/bookkeeper/test/BookKeeperClusterTestCase.java
View
@@ -132,6 +132,8 @@ Trunk (unreleased changes)
BOOKKEEPER-300: Create Bookie format command (Vinay via sijie)
+ BOOKKEEPER-208: Separate write quorum from ack quorum (ivank)
+
hedwig-server:
BOOKKEEPER-250: Need a ledger manager like interface to manage metadata operations in Hedwig (sijie via ivank)
@@ -78,7 +78,7 @@
}
}
- public BenchThroughputLatency(int ensemble, int qSize, byte[] passwd,
+ public BenchThroughputLatency(int ensemble, int writeQuorumSize, int ackQuorumSize, byte[] passwd,
int numberOfLedgers, int sendLimit, ClientConfiguration conf)
throws KeeperException, IOException, InterruptedException {
this.sem = new Semaphore(conf.getThrottleValue());
@@ -91,9 +91,11 @@ public BenchThroughputLatency(int ensemble, int qSize, byte[] passwd,
lh = new LedgerHandle[this.numberOfLedgers];
for(int i = 0; i < this.numberOfLedgers; i++) {
- lh[i] = bk.createLedger(ensemble, qSize, BookKeeper.DigestType.CRC32,
+ lh[i] = bk.createLedger(ensemble, writeQuorumSize,
+ ackQuorumSize,
+ BookKeeper.DigestType.CRC32,
passwd);
- LOG.info("Ledger Handle: " + lh[i].getId());
+ LOG.debug("Ledger Handle: " + lh[i].getId());
}
} catch (BKException e) {
e.printStackTrace();
@@ -233,6 +235,7 @@ public static void main(String[] args)
options.addOption("entrysize", true, "Entry size (bytes), default 1024");
options.addOption("ensemble", true, "Ensemble size, default 3");
options.addOption("quorum", true, "Quorum size, default 2");
+ options.addOption("ackQuorum", true, "Ack quorum size, default is same as quorum");
options.addOption("throttle", true, "Max outstanding requests, default 10000");
options.addOption("ledgers", true, "Number of ledgers, default 1");
options.addOption("zookeeper", true, "Zookeeper ensemble, default \"localhost:2181\"");
@@ -261,6 +264,10 @@ public static void main(String[] args)
int ledgers = Integer.valueOf(cmd.getOptionValue("ledgers", "1"));
int ensemble = Integer.valueOf(cmd.getOptionValue("ensemble", "3"));
int quorum = Integer.valueOf(cmd.getOptionValue("quorum", "2"));
+ int ackQuorum = quorum;
+ if (cmd.hasOption("ackQuorum")) {
+ ackQuorum = Integer.valueOf(cmd.getOptionValue("ackQuorum"));
+ }
int throttle = Integer.valueOf(cmd.getOptionValue("throttle", "10000"));
int sendLimit = Integer.valueOf(cmd.getOptionValue("sendlimit", "20000000"));
@@ -313,8 +320,8 @@ public void run() {
// Now do the benchmark
- BenchThroughputLatency bench = new BenchThroughputLatency(ensemble, quorum, passwd,
- ledgers, sendLimit, conf);
+ BenchThroughputLatency bench = new BenchThroughputLatency(ensemble, quorum, ackQuorum,
+ passwd, ledgers, sendLimit, conf);
bench.setEntryData(data);
thread = new Thread(bench);
ZooKeeper zk = null;
@@ -439,8 +446,8 @@ public void process(WatchedEvent event) {
}
}
- BenchThroughputLatency warmup = new BenchThroughputLatency(bookies, bookies, passwd,
- ledgers, 50000, conf);
+ BenchThroughputLatency warmup = new BenchThroughputLatency(bookies, bookies, bookies, passwd,
+ ledgers, 10000, conf);
warmup.setEntryData(data);
Thread thread = new Thread(warmup);
thread.start();
@@ -256,9 +256,47 @@ BookieClient getBookieClient() {
* authenticate access to a ledger, but also to verify entries in ledgers.
*
* @param ensSize
- * ensemble size
- * @param qSize
- * quorum size
+ * number of bookies over which to stripe entries
+ * @param writeQuorumSize
+ * number of bookies each entry will be written to. Each of these bookies
+ * must acknowledge the entry before the call is completed.
+ * @param digestType
+ * digest type, either MAC or CRC32
+ * @param passwd
+ * password
+ * @param cb
+ * createCallback implementation
+ * @param ctx
+ * optional control object
+ */
+ public void asyncCreateLedger(final int ensSize,
+ final int writeQuorumSize,
+ final DigestType digestType,
+ final byte[] passwd, final CreateCallback cb, final Object ctx)
+ {
+ asyncCreateLedger(ensSize, writeQuorumSize, writeQuorumSize, digestType, passwd, cb, ctx);
+ }
+
+ /**
+ * Creates a new ledger asynchronously. Ledgers created with this call have
+ * a separate write quorum and ack quorum size. The write quorum must be at least
+ * as large as the ack quorum.
+ *
+ * Separating the write and the ack quorum allows the BookKeeper client to continue
+ * writing when a bookie has failed but the failure has not yet been detected. Detecting that
+ * a bookie has failed can take a number of seconds, as configured by the read timeout
+ * {@link ClientConfiguration#getReadTimeout()}. Once the bookie failure is detected,
+ * that bookie will be removed from the ensemble.
+ *
+ * The other parameters match those of {@link #asyncCreateLedger(int, int, DigestType, byte[],
+ * AsyncCallback.CreateCallback, Object)}
+ *
+ * @param ensSize
+ * number of bookies over which to stripe entries
+ * @param writeQuorumSize
+ * number of bookies each entry will be written to
+ * @param ackQuorumSize
+ * number of bookies which must acknowledge an entry before the call is completed
* @param digestType
* digest type, either MAC or CRC32
* @param passwd
@@ -268,9 +306,17 @@ BookieClient getBookieClient() {
* @param ctx
* optional control object
*/
- public void asyncCreateLedger(final int ensSize, final int qSize, final DigestType digestType,
+
+ public void asyncCreateLedger(final int ensSize,
+ final int writeQuorumSize,
+ final int ackQuorumSize,
+ final DigestType digestType,
final byte[] passwd, final CreateCallback cb, final Object ctx) {
- new LedgerCreateOp(BookKeeper.this, ensSize, qSize, digestType, passwd, cb, ctx)
+ if (writeQuorumSize < ackQuorumSize) {
+ throw new IllegalArgumentException("Write quorum must be larger than ack quorum");
+ }
+ new LedgerCreateOp(BookKeeper.this, ensSize, writeQuorumSize,
+ ackQuorumSize, digestType, passwd, cb, ctx)
.initiate();
}
@@ -305,14 +351,34 @@ public LedgerHandle createLedger(DigestType digestType, byte passwd[])
* @throws BKException
*/
public LedgerHandle createLedger(int ensSize, int qSize,
- DigestType digestType, byte passwd[])
+ DigestType digestType, byte passwd[])
+ throws InterruptedException, BKException {
+ return createLedger(ensSize, qSize, qSize, digestType, passwd);
+ }
+
+ /**
+ * Synchronous call to create ledger. Parameters match those of
+ * {@link #asyncCreateLedger(int, int, int, DigestType, byte[],
+ * AsyncCallback.CreateCallback, Object)}
+ *
+ * @param ensSize
+ * @param writeQuorumSize
+ * @param ackQuorumSize
+ * @param digestType
+ * @param passwd
+ * @return a handle to the newly created ledger
+ * @throws InterruptedException
+ * @throws BKException
+ */
+ public LedgerHandle createLedger(int ensSize, int writeQuorumSize, int ackQuorumSize,
+ DigestType digestType, byte passwd[])
throws InterruptedException, BKException {
SyncCounter counter = new SyncCounter();
counter.inc();
/*
* Calls asynchronous version
*/
- asyncCreateLedger(ensSize, qSize, digestType, passwd,
+ asyncCreateLedger(ensSize, writeQuorumSize, ackQuorumSize, digestType, passwd,
new SyncCreateCallback(), counter);
/*
@@ -17,6 +17,7 @@
*/
package org.apache.bookkeeper.client;
+import java.util.List;
/**
* This interface determines how entries are distributed among bookies.
*
@@ -30,21 +31,34 @@
interface DistributionSchedule {
/**
- *
- * @param entryId
- * @param replicaIndex
- * @return index of bookie that should get this replica
+ * Returns the set of bookie indices to send the message to.
*/
- public int getBookieIndex(long entryId, int replicaIndex);
+ public List<Integer> getWriteSet(long entryId);
/**
- *
- * @param entryId
- * @param bookieIndex
- * @return -1 if the given bookie index is not a replica for the given
- * entryId
+ * An ack set represents the set of bookies from which
+ * a response must be received so that an entry can be
+ * considered to be replicated on a quorum.
*/
- public int getReplicaIndex(long entryId, int bookieIndex);
+ public interface AckSet {
+ /**
+ * Add a bookie response and check if quorum has been met
+ * @return true if quorum has been met, false otherwise
+ */
+ public boolean addBookieAndCheck(int bookieIndexHeardFrom);
+
+ /**
+ * Invalidate a previous bookie response.
+ * Used for reissuing write requests.
+ */
+ public void removeBookie(int bookie);
+ }
+
+ /**
+ * Returns an AckSet object against which responses should be checked.
+ */
+ public AckSet getAckSet();
+
/**
* Interface to keep track of which bookies in an ensemble, an action
@@ -222,7 +222,7 @@ public void checkLedger(LedgerHandle lh,
final long entryToRead = curEntryId;
EntryExistsCallback eecb
- = new EntryExistsCallback(lh.getLedgerMetadata().getQuorumSize(),
+ = new EntryExistsCallback(lh.getLedgerMetadata().getWriteQuorumSize(),
new GenericCallback<Boolean>() {
public void operationComplete(int rc, Boolean result) {
if (result) {
@@ -232,8 +232,7 @@ public void operationComplete(int rc, Boolean result) {
}
});
- for (int i = 0; i < lh.getLedgerMetadata().getQuorumSize(); i++) {
- int bi = lh.getDistributionSchedule().getBookieIndex(entryToRead, i);
+ for (int bi : lh.getDistributionSchedule().getWriteSet(entryToRead)) {
InetSocketAddress addr = curEnsemble.get(bi);
bookieClient.readEntry(addr, lh.getId(),
entryToRead, eecb, null);
@@ -68,9 +68,12 @@
* optional control object
*/
- LedgerCreateOp(BookKeeper bk, int ensembleSize, int quorumSize, DigestType digestType, byte[] passwd, CreateCallback cb, Object ctx) {
+ LedgerCreateOp(BookKeeper bk, int ensembleSize,
+ int writeQuorumSize, int ackQuorumSize,
+ DigestType digestType,
+ byte[] passwd, CreateCallback cb, Object ctx) {
this.bk = bk;
- this.metadata = new LedgerMetadata(ensembleSize, quorumSize, digestType, passwd);
+ this.metadata = new LedgerMetadata(ensembleSize, writeQuorumSize, ackQuorumSize, digestType, passwd);
this.digestType = digestType;
this.passwd = passwd;
this.cb = cb;
@@ -29,6 +29,7 @@
import java.util.Enumeration;
import java.util.Queue;
import java.util.concurrent.Semaphore;
+import java.util.concurrent.atomic.AtomicInteger;
import org.apache.bookkeeper.client.AsyncCallback.ReadLastConfirmedCallback;
import org.apache.bookkeeper.client.BKException;
@@ -74,6 +75,7 @@
*/
final static public long INVALID_ENTRY_ID = BookieProtocol.INVALID_ENTRY_ID;
+ final AtomicInteger blockAddCompletions = new AtomicInteger(0);
final Queue<PendingAddOp> pendingAddOps = new ConcurrentLinkedQueue<PendingAddOp>();
LedgerHandle(BookKeeper bk, long ledgerId, LedgerMetadata metadata,
@@ -98,7 +100,7 @@
macManager = DigestManager.instantiate(ledgerId, password, digestType);
this.ledgerKey = MacDigestManager.genDigest("ledger", password);
distributionSchedule = new RoundRobinDistributionSchedule(
- metadata.getQuorumSize(), metadata.getEnsembleSize());
+ metadata.getWriteQuorumSize(), metadata.getAckQuorumSize(), metadata.getEnsembleSize());
}
/**
@@ -641,8 +643,9 @@ void sendAddSuccessCallbacks() {
// Start from the head of the queue and proceed while the head entry
// is marked completed and blockAddCompletions is zero
PendingAddOp pendingAddOp;
- while ((pendingAddOp = pendingAddOps.peek()) != null) {
- if (pendingAddOp.numResponsesPending != 0) {
+ while ((pendingAddOp = pendingAddOps.peek()) != null
+ && blockAddCompletions.get() == 0) {
+ if (!pendingAddOp.completed) {
return;
}
pendingAddOps.remove();
@@ -660,6 +663,7 @@ void handleBookieFailure(final InetSocketAddress addr, final int bookieIndex) {
+ bookieIndex);
}
final ArrayList<InetSocketAddress> newEnsemble = new ArrayList<InetSocketAddress>();
+ blockAddCompletions.incrementAndGet();
final long newEnsembleStartEntry = lastAddConfirmed + 1;
// avoid parallel ensemble changes to same ensemble.
@@ -735,6 +739,8 @@ public void safeRun() {
handleUnrecoverableErrorDuringAdd(rc);
return;
}
+ blockAddCompletions.decrementAndGet();
+
// the failed bookie has been replaced
unsetSuccessAndSendWriteRequest(ensembleInfo.bookieIndex);
}
@@ -815,6 +821,7 @@ private boolean resolveConflict(LedgerMetadata newMeta) {
}
} else {
// the failed bookie has been replaced
+ blockAddCompletions.decrementAndGet();
unsetSuccessAndSendWriteRequest(ensembleInfo.bookieIndex);
}
return true;
Oops, something went wrong.

0 comments on commit e353a8f

Please sign in to comment.