From 4a294b9f48dcc8d3ebb2e639fae1cc9bbd1372ae Mon Sep 17 00:00:00 2001 From: xichen01 Date: Sat, 15 Apr 2023 20:36:35 +0800 Subject: [PATCH 1/6] HDDS-8428. Inconsistent Metrics units in CMSMetrics --- .../ozone/container/common/helpers/ContainerMetrics.java | 2 +- .../common/transport/server/ratis/CSMMetrics.java | 8 ++++---- .../transport/server/ratis/ContainerStateMachine.java | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerMetrics.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerMetrics.java index e717da65ff7f..732737023603 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerMetrics.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerMetrics.java @@ -83,7 +83,7 @@ public ContainerMetrics(int[] intervals) { for (int j = 0; j < len; j++) { int interval = intervals[j]; - String quantileName = ContainerProtos.Type.forNumber(i + 1) + "Nanos" + String quantileName = ContainerProtos.Type.forNumber(i + 1) + "Millis" + interval + "s"; opsLatQuantiles[i][j] = registry.newQuantiles(quantileName, "latency of Container ops", "ops", "latency", interval); diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java index 557473bf9cef..f5c1f4dbe4c2 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java @@ -209,12 +209,12 @@ public void incNumContainerNotOpenVerifyFailures() { numContainerNotOpenVerifyFailures.incr(); } - public void recordApplyTransactionCompletion(long latencyNanos) { - applyTransaction.add(latencyNanos); + public void recordApplyTransactionCompletion(long latencyMillis) { + applyTransaction.add(latencyMillis); } - public void recordWriteStateMachineCompletion(long latencyNanos) { - writeStateMachineData.add(latencyNanos); + public void recordWriteStateMachineCompletion(long latencyMillis) { + writeStateMachineData.add(latencyMillis); } public void incNumDataCacheMiss() { diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java index 7d7b28999fb3..0226103fbd4b 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java @@ -541,7 +541,7 @@ private CompletableFuture handleWriteChunk( } raftFuture.complete(r::toByteString); metrics.recordWriteStateMachineCompletion( - Time.monotonicNowNanos() - startTime); + Time.monotonicNow() - startTime); } writeChunkFutureMap.remove(entryIndex); @@ -621,7 +621,7 @@ private ExecutorService getChunkExecutor(WriteChunkRequestProto req) { public CompletableFuture write(LogEntryProto entry) { try { metrics.incNumWriteStateMachineOps(); - long writeStateMachineStartTime = Time.monotonicNowNanos(); + long writeStateMachineStartTime = Time.monotonicNow(); ContainerCommandRequestProto requestProto = getContainerCommandRequestProto(gid, entry.getStateMachineLogEntry().getLogData()); @@ -878,7 +878,7 @@ public CompletableFuture applyTransaction(TransactionContext trx) { new DispatcherContext.Builder().setTerm(trx.getLogEntry().getTerm()) .setLogIndex(index); - long applyTxnStartTime = Time.monotonicNowNanos(); + long applyTxnStartTime = Time.monotonicNow(); applyTransactionSemaphore.acquire(); metrics.incNumApplyTransactionsOps(); ContainerCommandRequestProto requestProto = @@ -967,7 +967,7 @@ public CompletableFuture applyTransaction(TransactionContext trx) { } applyTransactionSemaphore.release(); metrics.recordApplyTransactionCompletion( - Time.monotonicNowNanos() - applyTxnStartTime); + Time.monotonicNow() - applyTxnStartTime); }); return applyTransactionFuture; } catch (InterruptedException e) { From 3003031fbbb93665b3a2f4b63b45b4a030b85111 Mon Sep 17 00:00:00 2001 From: xichen01 Date: Wed, 19 Apr 2023 02:22:31 +0800 Subject: [PATCH 2/6] Add MS or NS time unit suffix for CSMMetrics --- .../common/helpers/ContainerMetrics.java | 2 +- .../transport/server/ratis/CSMMetrics.java | 31 ++++++++++++++++--- .../server/ratis/ContainerStateMachine.java | 14 ++++----- 3 files changed, 34 insertions(+), 13 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerMetrics.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerMetrics.java index 732737023603..e717da65ff7f 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerMetrics.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/ContainerMetrics.java @@ -83,7 +83,7 @@ public ContainerMetrics(int[] intervals) { for (int j = 0; j < len; j++) { int interval = intervals[j]; - String quantileName = ContainerProtos.Type.forNumber(i + 1) + "Millis" + String quantileName = ContainerProtos.Type.forNumber(i + 1) + "Nanos" + interval + "s"; opsLatQuantiles[i][j] = registry.newQuantiles(quantileName, "latency of Container ops", "ops", "latency", interval); diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java index f5c1f4dbe4c2..3749a2fc1135 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java @@ -46,8 +46,15 @@ public class CSMMetrics { private @Metric MutableCounterLong numBytesWrittenCount; private @Metric MutableCounterLong numBytesCommittedCount; + @Deprecated + //Use {@link transactionLatencyMS} instead private @Metric MutableRate transactionLatency; + @Deprecated + //Use {@link opsLatencyMS} instead private MutableRate[] opsLatency; + private @Metric MutableRate transactionLatencyMS; + private MutableRate[] opsLatencyMS; + private MetricsRegistry registry = null; // Failure Metrics @@ -62,17 +69,27 @@ public class CSMMetrics { private @Metric MutableCounterLong numDataCacheMiss; private @Metric MutableCounterLong numDataCacheHit; + @Deprecated + //Use {@link applyTransactionNs} instead private @Metric MutableRate applyTransaction; + @Deprecated + //Use {@link writeStateMachineDataNs} instead private @Metric MutableRate writeStateMachineData; + private @Metric MutableRate applyTransactionNs; + private @Metric MutableRate writeStateMachineDataNs; public CSMMetrics() { int numCmdTypes = ContainerProtos.Type.values().length; this.opsLatency = new MutableRate[numCmdTypes]; + this.opsLatencyMS = new MutableRate[numCmdTypes]; this.registry = new MetricsRegistry(CSMMetrics.class.getSimpleName()); for (int i = 0; i < numCmdTypes; i++) { opsLatency[i] = registry.newRate( ContainerProtos.Type.forNumber(i + 1).toString(), ContainerProtos.Type.forNumber(i + 1) + " op"); + opsLatencyMS[i] = registry.newRate( + ContainerProtos.Type.forNumber(i + 1).toString() + "Ms", + ContainerProtos.Type.forNumber(i + 1) + " op"); } } @@ -195,10 +212,12 @@ public MutableRate getApplyTransactionLatency() { return applyTransaction; } - public void incPipelineLatency(ContainerProtos.Type type, + public void incPipelineLatencyMs(ContainerProtos.Type type, long latencyMillis) { opsLatency[type.ordinal()].add(latencyMillis); transactionLatency.add(latencyMillis); + opsLatencyMS[type.ordinal()].add(latencyMillis); + transactionLatencyMS.add(latencyMillis); } public void incNumStartTransactionVerifyFailures() { @@ -209,12 +228,14 @@ public void incNumContainerNotOpenVerifyFailures() { numContainerNotOpenVerifyFailures.incr(); } - public void recordApplyTransactionCompletion(long latencyMillis) { - applyTransaction.add(latencyMillis); + public void recordApplyTransactionNsCompletion(long latencyNanos) { + applyTransactionNs.add(latencyNanos); + applyTransaction.add(latencyNanos); } - public void recordWriteStateMachineCompletion(long latencyMillis) { - writeStateMachineData.add(latencyMillis); + public void recordWriteStateMachineNsCompletion(long latencyNanos) { + writeStateMachineDataNs.add(latencyNanos); + writeStateMachineData.add(latencyNanos); } public void incNumDataCacheMiss() { diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java index 0226103fbd4b..768539bf2554 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java @@ -540,8 +540,8 @@ private CompletableFuture handleWriteChunk( write.getChunkData().getChunkName()); } raftFuture.complete(r::toByteString); - metrics.recordWriteStateMachineCompletion( - Time.monotonicNow() - startTime); + metrics.recordWriteStateMachineNsCompletion( + Time.monotonicNowNanos() - startTime); } writeChunkFutureMap.remove(entryIndex); @@ -621,7 +621,7 @@ private ExecutorService getChunkExecutor(WriteChunkRequestProto req) { public CompletableFuture write(LogEntryProto entry) { try { metrics.incNumWriteStateMachineOps(); - long writeStateMachineStartTime = Time.monotonicNow(); + long writeStateMachineStartTime = Time.monotonicNowNanos(); ContainerCommandRequestProto requestProto = getContainerCommandRequestProto(gid, entry.getStateMachineLogEntry().getLogData()); @@ -878,7 +878,7 @@ public CompletableFuture applyTransaction(TransactionContext trx) { new DispatcherContext.Builder().setTerm(trx.getLogEntry().getTerm()) .setLogIndex(index); - long applyTxnStartTime = Time.monotonicNow(); + long applyTxnStartTime = Time.monotonicNowNanos(); applyTransactionSemaphore.acquire(); metrics.incNumApplyTransactionsOps(); ContainerCommandRequestProto requestProto = @@ -914,7 +914,7 @@ public CompletableFuture applyTransaction(TransactionContext trx) { if (trx.getServerRole() == RaftPeerRole.LEADER && trx.getStateMachineContext() != null) { long startTime = (long) trx.getStateMachineContext(); - metrics.incPipelineLatency(cmdType, + metrics.incPipelineLatencyMs(cmdType, (Time.monotonicNowNanos() - startTime) / 1000000L); } // ignore close container exception while marking the stateMachine @@ -966,8 +966,8 @@ public CompletableFuture applyTransaction(TransactionContext trx) { + "{} exception {}", gid, requestProto.getCmdType(), index, t); } applyTransactionSemaphore.release(); - metrics.recordApplyTransactionCompletion( - Time.monotonicNow() - applyTxnStartTime); + metrics.recordApplyTransactionNsCompletion( + Time.monotonicNowNanos() - applyTxnStartTime); }); return applyTransactionFuture; } catch (InterruptedException e) { From c88331ea0f52e4982dd88c0abde30fd523a0dbfc Mon Sep 17 00:00:00 2001 From: xichen01 Date: Thu, 20 Apr 2023 01:39:06 +0800 Subject: [PATCH 3/6] Remove old Metrics --- .../transport/server/ratis/CSMMetrics.java | 37 ++++--------------- 1 file changed, 8 insertions(+), 29 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java index 3749a2fc1135..c74e39a9e58a 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java @@ -46,15 +46,8 @@ public class CSMMetrics { private @Metric MutableCounterLong numBytesWrittenCount; private @Metric MutableCounterLong numBytesCommittedCount; - @Deprecated - //Use {@link transactionLatencyMS} instead - private @Metric MutableRate transactionLatency; - @Deprecated - //Use {@link opsLatencyMS} instead - private MutableRate[] opsLatency; - private @Metric MutableRate transactionLatencyMS; - private MutableRate[] opsLatencyMS; - + private @Metric MutableRate transactionLatencyMs; + private MutableRate[] opsLatencyMs; private MetricsRegistry registry = null; // Failure Metrics @@ -69,25 +62,15 @@ public class CSMMetrics { private @Metric MutableCounterLong numDataCacheMiss; private @Metric MutableCounterLong numDataCacheHit; - @Deprecated - //Use {@link applyTransactionNs} instead - private @Metric MutableRate applyTransaction; - @Deprecated - //Use {@link writeStateMachineDataNs} instead - private @Metric MutableRate writeStateMachineData; private @Metric MutableRate applyTransactionNs; private @Metric MutableRate writeStateMachineDataNs; public CSMMetrics() { int numCmdTypes = ContainerProtos.Type.values().length; - this.opsLatency = new MutableRate[numCmdTypes]; - this.opsLatencyMS = new MutableRate[numCmdTypes]; + this.opsLatencyMs = new MutableRate[numCmdTypes]; this.registry = new MetricsRegistry(CSMMetrics.class.getSimpleName()); for (int i = 0; i < numCmdTypes; i++) { - opsLatency[i] = registry.newRate( - ContainerProtos.Type.forNumber(i + 1).toString(), - ContainerProtos.Type.forNumber(i + 1) + " op"); - opsLatencyMS[i] = registry.newRate( + opsLatencyMs[i] = registry.newRate( ContainerProtos.Type.forNumber(i + 1).toString() + "Ms", ContainerProtos.Type.forNumber(i + 1) + " op"); } @@ -208,16 +191,14 @@ public long getNumBytesCommittedCount() { return numBytesCommittedCount.value(); } - public MutableRate getApplyTransactionLatency() { - return applyTransaction; + public MutableRate getApplyTransactionNsLatency() { + return applyTransactionNs; } public void incPipelineLatencyMs(ContainerProtos.Type type, long latencyMillis) { - opsLatency[type.ordinal()].add(latencyMillis); - transactionLatency.add(latencyMillis); - opsLatencyMS[type.ordinal()].add(latencyMillis); - transactionLatencyMS.add(latencyMillis); + opsLatencyMs[type.ordinal()].add(latencyMillis); + transactionLatencyMs.add(latencyMillis); } public void incNumStartTransactionVerifyFailures() { @@ -230,12 +211,10 @@ public void incNumContainerNotOpenVerifyFailures() { public void recordApplyTransactionNsCompletion(long latencyNanos) { applyTransactionNs.add(latencyNanos); - applyTransaction.add(latencyNanos); } public void recordWriteStateMachineNsCompletion(long latencyNanos) { writeStateMachineDataNs.add(latencyNanos); - writeStateMachineData.add(latencyNanos); } public void incNumDataCacheMiss() { From f7694bec1c5dfe7da692ef2a193107ad2f402116 Mon Sep 17 00:00:00 2001 From: xichen01 Date: Thu, 20 Apr 2023 19:24:22 +0800 Subject: [PATCH 4/6] Fix test --- .../transport/server/ratis/TestCSMMetrics.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/TestCSMMetrics.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/TestCSMMetrics.java index 157bc94f0060..b3d6589cff78 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/TestCSMMetrics.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/TestCSMMetrics.java @@ -129,12 +129,12 @@ static void runContainerStateMachineMetrics( assertCounter("NumBytesCommittedCount", 0L, metric); assertCounter("NumStartTransactionVerifyFailures", 0L, metric); assertCounter("NumContainerNotOpenVerifyFailures", 0L, metric); - assertCounter("WriteChunkNumOps", 0L, metric); + assertCounter("WriteChunkMsNumOps", 0L, metric); double applyTransactionLatency = getDoubleGauge( - "ApplyTransactionAvgTime", metric); + "ApplyTransactionNsAvgTime", metric); assertTrue(applyTransactionLatency == 0.0); double writeStateMachineLatency = getDoubleGauge( - "WriteStateMachineDataAvgTime", metric); + "WriteStateMachineDataNsAvgTime", metric); assertTrue(writeStateMachineLatency == 0.0); // Write Chunk @@ -156,7 +156,7 @@ static void runContainerStateMachineMetrics( assertCounter("NumBytesCommittedCount", 1024L, metric); assertCounter("NumStartTransactionVerifyFailures", 0L, metric); assertCounter("NumContainerNotOpenVerifyFailures", 0L, metric); - assertCounter("WriteChunkNumOps", 1L, metric); + assertCounter("WriteChunkMsNumOps", 1L, metric); //Read Chunk ContainerProtos.ContainerCommandRequestProto readChunkRequest = @@ -171,10 +171,10 @@ static void runContainerStateMachineMetrics( assertCounter("NumQueryStateMachineOps", 1L, metric); assertCounter("NumApplyTransactionOps", 1L, metric); applyTransactionLatency = getDoubleGauge( - "ApplyTransactionAvgTime", metric); + "ApplyTransactionNsAvgTime", metric); assertTrue(applyTransactionLatency > 0.0); writeStateMachineLatency = getDoubleGauge( - "WriteStateMachineDataAvgTime", metric); + "WriteStateMachineDataNsAvgTime", metric); assertTrue(writeStateMachineLatency > 0.0); } finally { From d36a86c3193aa05a774f309dbc970d47ea0a3a16 Mon Sep 17 00:00:00 2001 From: xichen01 Date: Thu, 20 Apr 2023 19:24:58 +0800 Subject: [PATCH 5/6] Consistent function name --- .../container/common/transport/server/ratis/CSMMetrics.java | 4 ++-- .../common/transport/server/ratis/ContainerStateMachine.java | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java index c74e39a9e58a..2d9bcd22ac04 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java @@ -209,11 +209,11 @@ public void incNumContainerNotOpenVerifyFailures() { numContainerNotOpenVerifyFailures.incr(); } - public void recordApplyTransactionNsCompletion(long latencyNanos) { + public void recordApplyTransactionCompletionNs(long latencyNanos) { applyTransactionNs.add(latencyNanos); } - public void recordWriteStateMachineNsCompletion(long latencyNanos) { + public void recordWriteStateMachineCompletionNs(long latencyNanos) { writeStateMachineDataNs.add(latencyNanos); } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java index 768539bf2554..0af9657b69f4 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java @@ -540,7 +540,7 @@ private CompletableFuture handleWriteChunk( write.getChunkData().getChunkName()); } raftFuture.complete(r::toByteString); - metrics.recordWriteStateMachineNsCompletion( + metrics.recordWriteStateMachineCompletionNs( Time.monotonicNowNanos() - startTime); } @@ -966,7 +966,7 @@ public CompletableFuture applyTransaction(TransactionContext trx) { + "{} exception {}", gid, requestProto.getCmdType(), index, t); } applyTransactionSemaphore.release(); - metrics.recordApplyTransactionNsCompletion( + metrics.recordApplyTransactionCompletionNs( Time.monotonicNowNanos() - applyTxnStartTime); }); return applyTransactionFuture; From 39e4e2162397e79c0a589792013d042558e3f8ab Mon Sep 17 00:00:00 2001 From: xichen01 Date: Thu, 20 Apr 2023 19:27:48 +0800 Subject: [PATCH 6/6] Consistent function name --- .../container/common/transport/server/ratis/CSMMetrics.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java index 2d9bcd22ac04..d5c4f7ba0e30 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java @@ -191,7 +191,7 @@ public long getNumBytesCommittedCount() { return numBytesCommittedCount.value(); } - public MutableRate getApplyTransactionNsLatency() { + public MutableRate getApplyTransactionLatencyNs() { return applyTransactionNs; }