From 18c1eebc08f93055ffdef1812247b439c8404163 Mon Sep 17 00:00:00 2001 From: Shashikant Banerjee Date: Thu, 30 May 2019 16:17:45 +0530 Subject: [PATCH] HDDS-1502. Add metrics for Ozone Ratis performance.Contributed by Shashikant Banerjee(#833). --- .../transport/server/ratis/CSMMetrics.java | 58 +++++++++++++++++-- .../server/ratis/ContainerStateMachine.java | 16 ++++- .../server/ratis/TestCSMMetrics.java | 28 +++++---- 3 files changed, 82 insertions(+), 20 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java index 9ccf88ac77763..1ae3c53d909f4 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java @@ -37,13 +37,18 @@ public class CSMMetrics { // ratis op metrics metrics private @Metric MutableCounterLong numWriteStateMachineOps; - private @Metric MutableCounterLong numReadStateMachineOps; + private @Metric MutableCounterLong numQueryStateMachineOps; private @Metric MutableCounterLong numApplyTransactionOps; + private @Metric MutableCounterLong numReadStateMachineOps; + private @Metric MutableCounterLong numBytesWrittenCount; + private @Metric MutableCounterLong numBytesCommittedCount; // Failure Metrics private @Metric MutableCounterLong numWriteStateMachineFails; - private @Metric MutableCounterLong numReadStateMachineFails; + private @Metric MutableCounterLong numQueryStateMachineFails; private @Metric MutableCounterLong numApplyTransactionFails; + private @Metric MutableCounterLong numReadStateMachineFails; + private @Metric MutableCounterLong numReadStateMachineMissCount; public CSMMetrics() { } @@ -59,6 +64,10 @@ public void incNumWriteStateMachineOps() { numWriteStateMachineOps.incr(); } + public void incNumQueryStateMachineOps() { + numQueryStateMachineOps.incr(); + } + public void incNumReadStateMachineOps() { numReadStateMachineOps.incr(); } @@ -71,10 +80,26 @@ public void incNumWriteStateMachineFails() { numWriteStateMachineFails.incr(); } + public void incNumQueryStateMachineFails() { + numQueryStateMachineFails.incr(); + } + + public void incNumBytesWrittenCount(long value) { + numBytesWrittenCount.incr(value); + } + + public void incNumBytesCommittedCount(long value) { + numBytesCommittedCount.incr(value); + } + public void incNumReadStateMachineFails() { numReadStateMachineFails.incr(); } + public void incNumReadStateMachineMissCount() { + numReadStateMachineMissCount.incr(); + } + public void incNumApplyTransactionsFails() { numApplyTransactionFails.incr(); } @@ -85,8 +110,8 @@ public long getNumWriteStateMachineOps() { } @VisibleForTesting - public long getNumReadStateMachineOps() { - return numReadStateMachineOps.value(); + public long getNumQueryStateMachineOps() { + return numQueryStateMachineOps.value(); } @VisibleForTesting @@ -100,8 +125,8 @@ public long getNumWriteStateMachineFails() { } @VisibleForTesting - public long getNumReadStateMachineFails() { - return numReadStateMachineFails.value(); + public long getNumQueryStateMachineFails() { + return numQueryStateMachineFails.value(); } @VisibleForTesting @@ -109,6 +134,27 @@ public long getNumApplyTransactionsFails() { return numApplyTransactionFails.value(); } + @VisibleForTesting + public long getNumReadStateMachineFails() { + return numReadStateMachineFails.value(); + } + + @VisibleForTesting + public long getNumReadStateMachineMissCount() { + return numReadStateMachineMissCount.value(); + } + + @VisibleForTesting + public long getNumBytesWrittenCount() { + return numBytesWrittenCount.value(); + } + + @VisibleForTesting + public long getNumBytesCommittedCount() { + return numBytesCommittedCount.value(); + } + + public void unRegister() { MetricsSystem ms = DefaultMetricsSystem.instance(); ms.unregisterSource(SOURCE_NAME); diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java index 42a4d997a8f6c..7a7baec3001b2 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java @@ -391,6 +391,8 @@ private CompletableFuture handleWriteChunk( // Remove the future once it finishes execution from the // writeChunkFutureMap. writeChunkFuture.thenApply(r -> { + metrics.incNumBytesWrittenCount( + requestProto.getWriteChunk().getChunkData().getLen()); writeChunkFutureMap.remove(entryIndex); LOG.debug("writeChunk writeStateMachineData completed: blockId " + write .getBlockID() + " logIndex " + entryIndex + " chunkName " + write @@ -438,12 +440,12 @@ public CompletableFuture writeStateMachineData(LogEntryProto entry) { @Override public CompletableFuture query(Message request) { try { - metrics.incNumReadStateMachineOps(); + metrics.incNumQueryStateMachineOps(); final ContainerCommandRequestProto requestProto = getContainerCommandRequestProto(request.getContent()); return CompletableFuture.completedFuture(runCommand(requestProto, null)); } catch (IOException e) { - metrics.incNumReadStateMachineFails(); + metrics.incNumQueryStateMachineFails(); return completeExceptionally(e); } } @@ -520,10 +522,14 @@ public CompletableFuture flushStateMachineData(long index) { public CompletableFuture readStateMachineData( LogEntryProto entry) { StateMachineLogEntryProto smLogEntryProto = entry.getStateMachineLogEntry(); + metrics.incNumReadStateMachineOps(); if (!getStateMachineData(smLogEntryProto).isEmpty()) { return CompletableFuture.completedFuture(ByteString.EMPTY); } try { + // the stateMachine data is not present in the stateMachine cache, + // increment the stateMachine cache miss count + metrics.incNumReadStateMachineMissCount(); final ContainerCommandRequestProto requestProto = getContainerCommandRequestProto( entry.getStateMachineLogEntry().getLogData()); @@ -537,6 +543,7 @@ public CompletableFuture readStateMachineData( getCachedStateMachineData(entry.getIndex(), entry.getTerm(), requestProto)); } catch (ExecutionException e) { + metrics.incNumReadStateMachineFails(); future.completeExceptionally(e); } return future; @@ -547,6 +554,7 @@ public CompletableFuture readStateMachineData( + " cannot have state machine data"); } } catch (Exception e) { + metrics.incNumReadStateMachineFails(); LOG.error("unable to read stateMachineData:" + e); return completeExceptionally(e); } @@ -618,6 +626,10 @@ public CompletableFuture applyTransaction(TransactionContext trx) { applyTransactionCompletionMap .put(index, trx.getLogEntry().getTerm()); Preconditions.checkState(previous == null); + if (cmdType == Type.WriteChunk || cmdType == Type.PutSmallFile) { + metrics.incNumBytesCommittedCount( + requestProto.getWriteChunk().getChunkData().getLen()); + } updateLastApplied(); }); return future; diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/TestCSMMetrics.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/TestCSMMetrics.java index dc9e13375cb5b..21593242a2e28 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/TestCSMMetrics.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/TestCSMMetrics.java @@ -14,8 +14,10 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. + = GenericTestUtils.getTestDir("dfs").getAbsolutePath() + File.separator; + */ -package org.apache.hadoop.ozone.container.common.transport.server.ratis; + package org.apache.hadoop.ozone.container.common.transport.server.ratis; import static org.apache.hadoop.test.MetricsAsserts.assertCounter; import static org.apache.hadoop.test.MetricsAsserts.getMetrics; @@ -29,9 +31,9 @@ import org.apache.hadoop.hdds.protocol.DatanodeDetails; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos - .ContainerCommandRequestProto; + .ContainerCommandRequestProto; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos - .ContainerCommandResponseProto; + .ContainerCommandResponseProto; import org.apache.hadoop.hdds.scm.*; import org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException; import org.apache.hadoop.hdds.scm.pipeline.Pipeline; @@ -42,7 +44,7 @@ import org.apache.hadoop.ozone.container.common.interfaces.ContainerDispatcher; import org.apache.hadoop.ozone.container.common.interfaces.Handler; import org.apache.hadoop.ozone.container.common.transport.server - .XceiverServerSpi; + .XceiverServerSpi; import org.apache.hadoop.ozone.web.utils.OzoneUtils; import org.apache.hadoop.test.GenericTestUtils; import org.apache.hadoop.hdds.conf.OzoneConfiguration; @@ -57,13 +59,11 @@ import org.junit.Test; import org.junit.Assert; -/** - * This class tests the metrics of ContainerStateMachine. - */ -public class TestCSMMetrics { - static final String TEST_DIR - = GenericTestUtils.getTestDir("dfs").getAbsolutePath() + File.separator; - + /** + * This class tests the metrics of ContainerStateMachine. + */ + public class TestCSMMetrics { + static final String TEST_DIR @FunctionalInterface interface CheckedBiFunction { OUT apply(LEFT left, RIGHT right) throws THROWABLE; @@ -112,6 +112,8 @@ static void runContainerStateMachineMetrics( assertCounter("NumWriteStateMachineOps", 0L, metric); assertCounter("NumReadStateMachineOps", 0L, metric); assertCounter("NumApplyTransactionOps", 0L, metric); + assertCounter("NumBytesWrittenCount", 0L, metric); + assertCounter("NumBytesCommittedCount", 0L, metric); // Write Chunk BlockID blockID = ContainerTestHelper.getTestBlockID(ContainerTestHelper. @@ -127,7 +129,9 @@ static void runContainerStateMachineMetrics( metric = getMetrics(CSMMetrics.SOURCE_NAME + RaftGroupId.valueOf(pipeline.getId().getId()).toString()); assertCounter("NumWriteStateMachineOps", 1L, metric); + assertCounter("NumBytesWrittenCount", 1024L, metric); assertCounter("NumApplyTransactionOps", 1L, metric); + assertCounter("NumBytesCommittedCount", 1024L, metric); //Read Chunk ContainerProtos.ContainerCommandRequestProto readChunkRequest = @@ -139,7 +143,7 @@ static void runContainerStateMachineMetrics( metric = getMetrics(CSMMetrics.SOURCE_NAME + RaftGroupId.valueOf(pipeline.getId().getId()).toString()); - assertCounter("NumReadStateMachineOps", 1L, metric); + assertCounter("NumQueryStateMachineOps", 1L, metric); assertCounter("NumApplyTransactionOps", 1L, metric); } finally { if (client != null) {