diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
index b43caabd8d8..b59bd71c122 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
@@ -244,13 +244,13 @@ public ReplicationManager(final ConfigurationSource conf,
         TimeUnit.MILLISECONDS);
     this.containerReplicaPendingOps = replicaPendingOps;
     this.legacyReplicationManager = legacyReplicationManager;
-    this.ecReplicationCheckHandler = new ECReplicationCheckHandler();
+    this.metrics = ReplicationManagerMetrics.create(this);
+    this.ecReplicationCheckHandler = new ECReplicationCheckHandler(metrics);
     this.ecMisReplicationCheckHandler =
         new ECMisReplicationCheckHandler(ecContainerPlacement);
     this.ratisReplicationCheckHandler =
         new RatisReplicationCheckHandler(ratisContainerPlacement, this);
     this.nodeManager = nodeManager;
-    this.metrics = ReplicationManagerMetrics.create(this);
 
     ecUnderReplicationHandler = new ECUnderReplicationHandler(
         ecContainerPlacement, conf, this);
@@ -365,7 +365,7 @@ protected void startSubServices() {
 
   /**
    * Process all the containers now, and wait for the processing to complete.
-   * This in intended to be used in tests.
+   * This is intended to be used in tests.
    */
   public synchronized void processAll() {
     if (!shouldRun()) {
@@ -380,6 +380,12 @@ public synchronized void processAll() {
         containerManager.getContainers();
     ReplicationManagerReport report = new ReplicationManagerReport();
     ReplicationQueue newRepQueue = new ReplicationQueue();
+
+    getMetrics().resetEcUnderReplicatedContainers();
+    getMetrics().resetEcCriticalUnderReplicatedContainers();
+    getMetrics().resetEcUnhealthyReplicatedContainers();
+    getMetrics().resetEcMissingContainers();
+
     for (ContainerInfo c : containers) {
       if (!shouldRun()) {
         break;
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManagerMetrics.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManagerMetrics.java
index eb75db9bd50..6c8cec35e3e 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManagerMetrics.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManagerMetrics.java
@@ -30,6 +30,7 @@
 import org.apache.hadoop.metrics2.lib.Interns;
 import org.apache.hadoop.metrics2.lib.MetricsRegistry;
 import org.apache.hadoop.metrics2.lib.MutableCounterLong;
+import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
 import org.apache.hadoop.metrics2.lib.MutableRate;
 import org.apache.hadoop.ozone.OzoneConsts;
@@ -215,7 +216,7 @@ public final class ReplicationManagerMetrics implements MetricsSource {
       "mis-replicated EC container due to insufficient nodes available.")
   private MutableCounterLong ecPartialReplicationForMisReplicationTotal;
 
-  @Metric("NUmber of Reconstruct EC Container commands that could not be sent "
+  @Metric("Number of Reconstruct EC Container commands that could not be sent "
       + "due to the pending commands on the target datanode")
   private MutableCounterLong ecReconstructionCmdsDeferredTotal;
 
@@ -227,6 +228,20 @@ public final class ReplicationManagerMetrics implements MetricsSource {
       + "to the pending commands on all source datanodes")
   private MutableCounterLong replicateContainerCmdsDeferredTotal;
 
+  @Metric("Number of EC containers with insufficient replicas")
+  private MutableGaugeLong ecUnderReplicatedContainers;
+
+  @Metric("Number of EC containers with insufficient replicas that equal "
+      + "the number of data replicas")
+  private MutableGaugeLong ecCriticalUnderReplicatedContainers;
+
+  @Metric("Number of EC containers with insufficient replicas that are "
+      + "less than the number of data replicas")
+  private MutableGaugeLong ecUnhealthyReplicatedContainers;
+
+  @Metric("EC Containers with no online replicas")
+  private MutableGaugeLong ecMissingContainers;
+
   public ReplicationManagerMetrics(ReplicationManager manager) {
     this.registry = new MetricsRegistry(METRICS_SOURCE_NAME);
@@ -283,6 +298,11 @@ public void getMetrics(MetricsCollector collector, boolean all) {
       builder.addGauge(e.getValue(), report.getStat(e.getKey()));
     }
 
+    ecUnderReplicatedContainers.snapshot(builder, all);
+    ecCriticalUnderReplicatedContainers.snapshot(builder, all);
+    ecUnhealthyReplicatedContainers.snapshot(builder, all);
+    ecMissingContainers.snapshot(builder, all);
+
     replicationCmdsSentTotal.snapshot(builder, all);
     replicasCreatedTotal.snapshot(builder, all);
     replicaCreateTimeoutTotal.snapshot(builder, all);
@@ -614,4 +634,52 @@ public long getPartialReplicationForMisReplicationTotal() {
     return this.partialReplicationForMisReplicationTotal.value();
   }
 
+  public void incrEcUnderReplicatedContainers() {
+    this.ecUnderReplicatedContainers.incr();
+  }
+
+  public long getEcUnderReplicatedContainers() {
+    return this.ecUnderReplicatedContainers.value();
+  }
+
+  public void resetEcUnderReplicatedContainers() {
+    ecUnderReplicatedContainers.set(0);
+  }
+
+  public void incrEcCriticalUnderReplicatedContainers() {
+    this.ecCriticalUnderReplicatedContainers.incr();
+  }
+
+  public long getEcCriticalUnderReplicatedContainers() {
+    return this.ecCriticalUnderReplicatedContainers.value();
+  }
+
+  public void resetEcCriticalUnderReplicatedContainers() {
+    ecCriticalUnderReplicatedContainers.set(0);
+  }
+
+  public void incrEcUnhealthyReplicatedContainers() {
+    this.ecUnhealthyReplicatedContainers.incr();
+  }
+
+  public long getEcUnhealthyReplicatedContainers() {
+    return this.ecUnhealthyReplicatedContainers.value();
+  }
+
+  public void resetEcUnhealthyReplicatedContainers() {
+    ecUnhealthyReplicatedContainers.set(0);
+  }
+
+  public void incrEcMissingContainers() {
+    this.ecMissingContainers.incr();
+  }
+
+  public long getEcMissingContainers() {
+    return this.ecMissingContainers.value();
+  }
+
+  public void resetEcMissingContainers() {
+    ecMissingContainers.set(0);
+  }
+
 }
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ECReplicationCheckHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ECReplicationCheckHandler.java
index 1b58694441f..723d83709d9 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ECReplicationCheckHandler.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ECReplicationCheckHandler.java
@@ -25,6 +25,7 @@
 import org.apache.hadoop.hdds.scm.container.replication.ContainerHealthResult;
 import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaOp;
 import org.apache.hadoop.hdds.scm.container.replication.ECContainerReplicaCount;
+import org.apache.hadoop.hdds.scm.container.replication.ReplicationManagerMetrics;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -43,6 +44,12 @@ public class ECReplicationCheckHandler extends AbstractCheck {
   private static final Logger LOG =
       LoggerFactory.getLogger(ECReplicationCheckHandler.class);
 
+  private final ReplicationManagerMetrics metrics;
+
+  public ECReplicationCheckHandler(ReplicationManagerMetrics replicationManagerMetrics) {
+    this.metrics = replicationManagerMetrics;
+  }
+
   @Override
   public boolean handle(ContainerCheckRequest request) {
     if (request.getContainerInfo().getReplicationType() != EC) {
@@ -67,22 +74,29 @@ public boolean handle(ContainerCheckRequest request) {
       if (underHealth.isMissing()) {
         report.incrementAndSample(
             ReplicationManagerReport.HealthState.MISSING, containerID);
+        metrics.incrEcMissingContainers();
       } else {
         // A container which is unrecoverable but not missing must have too
         // many unhealthy replicas. Therefore it is UNHEALTHY rather than
         // missing.
         report.incrementAndSample(
             ReplicationManagerReport.HealthState.UNHEALTHY, containerID);
+        metrics.incrEcUnhealthyReplicatedContainers();
       }
       // An EC container can be both unrecoverable and have offline replicas. In this case, we need
       // to report both states as the decommission monitor needs to wait for an extra copy to be
       // made of the offline replica before decommission can complete.
       if (underHealth.hasUnreplicatedOfflineIndexes()) {
         report.incrementAndSample(ReplicationManagerReport.HealthState.UNDER_REPLICATED, containerID);
+        metrics.incrEcUnderReplicatedContainers();
       }
     } else {
       report.incrementAndSample(
           ReplicationManagerReport.HealthState.UNDER_REPLICATED, containerID);
+      metrics.incrEcUnderReplicatedContainers();
+      if (isCritical(request)) {
+        metrics.incrEcCriticalUnderReplicatedContainers();
+      }
     }
     if (!underHealth.isReplicatedOkAfterPending()
         && (!underHealth.isUnrecoverable()
@@ -173,4 +187,16 @@ public ContainerHealthResult checkHealth(ContainerCheckRequest request) {
     return new ContainerHealthResult.HealthyResult(container);
   }
 
+  private boolean isCritical(ContainerCheckRequest request) {
+    ContainerInfo container = request.getContainerInfo();
+    Set<ContainerReplica> replicas = request.getContainerReplicas();
+    List<ContainerReplicaOp> replicaPendingOps = request.getPendingOps();
+    ECContainerReplicaCount replicaCount =
+        new ECContainerReplicaCount(container, replicas, replicaPendingOps,
+            request.getMaintenanceRedundancy());
+    ECReplicationConfig repConfig =
+        (ECReplicationConfig) container.getReplicationConfig();
+    return replicaCount.unavailableIndexes(true).size() == repConfig.getParity();
+  }
+
 }
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestECReplicationCheckHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestECReplicationCheckHandler.java
index c0efa2550a6..c1c3916394e 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestECReplicationCheckHandler.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestECReplicationCheckHandler.java
@@ -33,8 +33,11 @@
 import org.apache.hadoop.hdds.scm.container.replication.ContainerHealthResult.HealthState;
 import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaOp;
+import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager;
+import org.apache.hadoop.hdds.scm.container.replication.ReplicationManagerMetrics;
 import org.apache.hadoop.hdds.scm.container.replication.ReplicationQueue;
 import org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil;
+import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -76,13 +79,21 @@ public class TestECReplicationCheckHandler {
   private ContainerCheckRequest.Builder requestBuilder;
   private ReplicationManagerReport report;
   private PlacementPolicy placementPolicy;
+  private ReplicationManagerMetrics metrics;
 
   @BeforeEach
   public void setup() {
     placementPolicy = mock(PlacementPolicy.class);
     when(placementPolicy.validateContainerPlacement(anyList(), anyInt()))
         .thenReturn(new ContainerPlacementStatusDefault(2, 2, 3));
-    healthCheck = new ECReplicationCheckHandler();
+    ReplicationManager replicationManager = mock(ReplicationManager.class);
+    ReplicationManager.ReplicationManagerConfiguration rmConf =
+        new ReplicationManager.ReplicationManagerConfiguration();
+    when(replicationManager.getConfig())
+        .thenReturn(rmConf);
+    metrics = ReplicationManagerMetrics.create(replicationManager);
+    when(replicationManager.getMetrics()).thenReturn(metrics);
+    healthCheck = new ECReplicationCheckHandler(metrics);
     repConfig = new ECReplicationConfig(3, 2);
     repQueue = new ReplicationQueue();
     report = new ReplicationManagerReport();
@@ -93,6 +104,13 @@ public void setup() {
         .setReport(report);
   }
 
+  @AfterEach
+  void cleanup() {
+    if (metrics != null) {
+      metrics.unRegister();
+    }
+  }
+
   @Test
   public void testHealthyContainerIsHealthy() {
     ContainerInfo container = createContainerInfo(repConfig);
@@ -132,6 +150,8 @@ public void testUnderReplicatedContainerIsUnderReplicated() {
     assertEquals(0, repQueue.overReplicatedQueueSize());
     assertEquals(1, report.getStat(
         ReplicationManagerReport.HealthState.UNDER_REPLICATED));
+
+    assertEquals(1, metrics.getEcUnderReplicatedContainers());
   }
 
   @Test
@@ -161,6 +181,8 @@ public void testUnderReplicatedContainerFixedWithPending() {
     // Still under replicated until the pending complete
     assertEquals(1, report.getStat(
         ReplicationManagerReport.HealthState.UNDER_REPLICATED));
+
+    assertEquals(1, metrics.getEcUnderReplicatedContainers());
   }
 
   @Test
@@ -188,6 +210,8 @@ public void testUnderReplicatedDueToOutOfService() {
     // Still under replicated until the pending complete
     assertEquals(1, report.getStat(
         ReplicationManagerReport.HealthState.UNDER_REPLICATED));
+
+    assertEquals(1, metrics.getEcUnderReplicatedContainers());
   }
 
   @Test
@@ -220,6 +244,8 @@ public void testUnderReplicatedDueToOutOfServiceFixedWithPending() {
     // Still under replicated until the pending complete
     assertEquals(1, report.getStat(
         ReplicationManagerReport.HealthState.UNDER_REPLICATED));
+
+    assertEquals(1, metrics.getEcUnderReplicatedContainers());
   }
 
   @Test
@@ -249,6 +275,8 @@ public void testUnderReplicatedDueToOutOfServiceAndMissingReplica() {
     assertEquals(0, repQueue.overReplicatedQueueSize());
     assertEquals(1, report.getStat(
         ReplicationManagerReport.HealthState.UNDER_REPLICATED));
+
+    assertEquals(1, metrics.getEcUnderReplicatedContainers());
   }
 
   @Test
@@ -279,6 +307,8 @@ public void testUnderReplicatedAndUnrecoverable() {
         ReplicationManagerReport.HealthState.MISSING));
     assertEquals(0, report.getStat(
         ReplicationManagerReport.HealthState.UNHEALTHY));
+
+    assertEquals(1, metrics.getEcMissingContainers());
   }
 
   @Test
@@ -312,6 +342,8 @@ public void testUnderReplicatedAndUnhealthy() {
         ReplicationManagerReport.HealthState.MISSING));
     assertEquals(1, report.getStat(
         ReplicationManagerReport.HealthState.UNHEALTHY));
+
+    assertEquals(1, metrics.getEcUnhealthyReplicatedContainers());
   }
 
   @Test
@@ -352,6 +384,9 @@ private void testUnderReplicatedAndUnrecoverableWithOffline(
         ReplicationManagerReport.HealthState.UNDER_REPLICATED));
     assertEquals(1, report.getStat(
         ReplicationManagerReport.HealthState.MISSING));
+
+    assertEquals(1, metrics.getEcUnderReplicatedContainers());
+    assertEquals(1, metrics.getEcMissingContainers());
   }
 
   @Test
@@ -398,6 +433,9 @@ private void testUnderReplicatedAndUnrecoverableWithOfflinePending(
         ReplicationManagerReport.HealthState.UNDER_REPLICATED));
     assertEquals(1, report.getStat(
         ReplicationManagerReport.HealthState.MISSING));
+
+    assertEquals(1, metrics.getEcUnderReplicatedContainers());
+    assertEquals(1, metrics.getEcMissingContainers());
   }
 
   /**
@@ -438,6 +476,9 @@ public void testUnderReplicatedDueToUnhealthyReplicas() {
     assertEquals(0, repQueue.overReplicatedQueueSize());
     assertEquals(1, report.getStat(
         ReplicationManagerReport.HealthState.UNDER_REPLICATED));
+
+    assertEquals(1, metrics.getEcUnderReplicatedContainers());
+    assertEquals(1, metrics.getEcCriticalUnderReplicatedContainers());
   }
 
   /**
@@ -581,6 +622,8 @@ public void testOverAndUnderReplicated() {
         ReplicationManagerReport.HealthState.UNDER_REPLICATED));
     assertEquals(0, report.getStat(
         ReplicationManagerReport.HealthState.OVER_REPLICATED));
+
+    assertEquals(1, metrics.getEcUnderReplicatedContainers());
   }
 
   @Test
@@ -614,6 +657,8 @@ public void testUnderAndMisReplicatedContainer() {
         ReplicationManagerReport.HealthState.OVER_REPLICATED));
     assertEquals(0, report.getStat(
         ReplicationManagerReport.HealthState.MIS_REPLICATED));
+
+    assertEquals(1, metrics.getEcUnderReplicatedContainers());
   }
 
   @Test