HDDS-10900. Add some health state metrics for EC container
YuanbenWang committed May 23, 2024
1 parent 6f30f2f commit 9ff0e07
Showing 4 changed files with 150 additions and 5 deletions.
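In short: the commit adds four MutableGaugeLong metrics (ecUnderReplicatedContainers, ecCriticalUnderReplicatedContainers, ecUnhealthyReplicatedContainers and ecMissingContainers) to ReplicationManagerMetrics, resets them at the start of every ReplicationManager.processAll() pass, and increments them from ECReplicationCheckHandler while each EC container is checked. A minimal sketch of that lifecycle, with hypothetical names and plain booleans standing in for the real health results:

// Sketch only: mirrors the reset-then-increment lifecycle this commit
// introduces; the real code is in the diffs below.
final class EcHealthGaugesSketch {
  private long under, critical, unhealthy, missing;

  // processAll() resets the gauges so they reflect only the latest full scan.
  void startPass() {
    under = critical = unhealthy = missing = 0;
  }

  // handle() bumps whichever gauges match an EC container's health state.
  void onEcContainerChecked(boolean unrecoverable, boolean noOnlineReplicas,
      boolean offlineIndexesNeedCopy, boolean oneLossFromUnrecoverable) {
    if (unrecoverable) {
      if (noOnlineReplicas) {
        missing++;        // MISSING: no replica of the container is online
      } else {
        unhealthy++;      // UNHEALTHY: replicas exist, but too many are unhealthy
      }
      if (offlineIndexesNeedCopy) {
        under++;          // decommission still needs an extra copy of an offline replica
      }
    } else {
      under++;            // recoverable, but short of replicas
      if (oneLossFromUnrecoverable) {
        critical++;       // losing one more replica would make it unrecoverable
      }
    }
  }
}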
ReplicationManager.java
@@ -244,13 +244,13 @@ public ReplicationManager(final ConfigurationSource conf,
TimeUnit.MILLISECONDS);
this.containerReplicaPendingOps = replicaPendingOps;
this.legacyReplicationManager = legacyReplicationManager;
this.ecReplicationCheckHandler = new ECReplicationCheckHandler();
this.metrics = ReplicationManagerMetrics.create(this);
this.ecReplicationCheckHandler = new ECReplicationCheckHandler(metrics);
this.ecMisReplicationCheckHandler =
new ECMisReplicationCheckHandler(ecContainerPlacement);
this.ratisReplicationCheckHandler =
new RatisReplicationCheckHandler(ratisContainerPlacement, this);
this.nodeManager = nodeManager;
this.metrics = ReplicationManagerMetrics.create(this);

ecUnderReplicationHandler = new ECUnderReplicationHandler(
ecContainerPlacement, conf, this);
@@ -365,7 +365,7 @@ protected void startSubServices() {

/**
* Process all the containers now, and wait for the processing to complete.
* This in intended to be used in tests.
* This is intended to be used in tests.
*/
public synchronized void processAll() {
if (!shouldRun()) {
@@ -380,6 +380,12 @@ public synchronized void processAll() {
containerManager.getContainers();
ReplicationManagerReport report = new ReplicationManagerReport();
ReplicationQueue newRepQueue = new ReplicationQueue();

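// Reset the EC health-state gauges so that each full pass reports only the
// containers observed in the current scan.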
getMetrics().resetEcUnderReplicatedContainers();
getMetrics().resetEcCriticalUnderReplicatedContainers();
getMetrics().resetEcUnhealthyReplicatedContainers();
getMetrics().resetEcMissingContainers();

for (ContainerInfo c : containers) {
if (!shouldRun()) {
break;
ReplicationManagerMetrics.java
@@ -30,6 +30,7 @@
import org.apache.hadoop.metrics2.lib.Interns;
import org.apache.hadoop.metrics2.lib.MetricsRegistry;
import org.apache.hadoop.metrics2.lib.MutableCounterLong;
import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
import org.apache.hadoop.metrics2.lib.MutableRate;
import org.apache.hadoop.ozone.OzoneConsts;

@@ -215,7 +216,7 @@ public final class ReplicationManagerMetrics implements MetricsSource {
"mis-replicated EC container due to insufficient nodes available.")
private MutableCounterLong ecPartialReplicationForMisReplicationTotal;

@Metric("NUmber of Reconstruct EC Container commands that could not be sent "
@Metric("Number of Reconstruct EC Container commands that could not be sent "
+ "due to the pending commands on the target datanode")
private MutableCounterLong ecReconstructionCmdsDeferredTotal;

@@ -227,6 +228,20 @@
+ "to the pending commands on all source datanodes")
private MutableCounterLong replicateContainerCmdsDeferredTotal;

@Metric("Number of EC containers with insufficient replicas")
private MutableGaugeLong ecUnderReplicatedContainers;

@Metric("Number of under-replicated EC containers whose remaining replicas "
+ "equal the number of data blocks")
private MutableGaugeLong ecCriticalUnderReplicatedContainers;

@Metric("Number of EC containers whose healthy replicas are fewer than "
+ "the number of data blocks")
private MutableGaugeLong ecUnhealthyReplicatedContainers;

@Metric("Number of EC containers with no online replicas")
private MutableGaugeLong ecMissingContainers;


public ReplicationManagerMetrics(ReplicationManager manager) {
this.registry = new MetricsRegistry(METRICS_SOURCE_NAME);
@@ -283,6 +298,11 @@ public void getMetrics(MetricsCollector collector, boolean all) {
builder.addGauge(e.getValue(), report.getStat(e.getKey()));
}

ecUnderReplicatedContainers.snapshot(builder, all);
ecCriticalUnderReplicatedContainers.snapshot(builder, all);
ecUnhealthyReplicatedContainers.snapshot(builder, all);
ecMissingContainers.snapshot(builder, all);

replicationCmdsSentTotal.snapshot(builder, all);
replicasCreatedTotal.snapshot(builder, all);
replicaCreateTimeoutTotal.snapshot(builder, all);
@@ -614,4 +634,52 @@ public long getPartialReplicationForMisReplicationTotal() {
return this.partialReplicationForMisReplicationTotal.value();
}

public void incrEcUnderReplicatedContainers() {
this.ecUnderReplicatedContainers.incr();
}

public long getEcUnderReplicatedContainers() {
return this.ecUnderReplicatedContainers.value();
}

public void resetEcUnderReplicatedContainers() {
ecUnderReplicatedContainers.set(0);
}

public void incrEcCriticalUnderReplicatedContainers() {
this.ecCriticalUnderReplicatedContainers.incr();
}

public long getEcCriticalUnderReplicatedContainers() {
return this.ecCriticalUnderReplicatedContainers.value();
}

public void resetEcCriticalUnderReplicatedContainers() {
ecCriticalUnderReplicatedContainers.set(0);
}

public void incrEcUnhealthyReplicatedContainers() {
this.ecUnhealthyReplicatedContainers.incr();
}

public long getEcUnhealthyReplicatedContainers() {
return this.ecUnhealthyReplicatedContainers.value();
}

public void resetEcUnhealthyReplicatedContainers() {
ecUnhealthyReplicatedContainers.set(0);
}

public void incrEcMissingContainers() {
this.ecMissingContainers.incr();
}

public long getEcMissingContainers() {
return this.ecMissingContainers.value();
}

public void resetEcMissingContainers() {
ecMissingContainers.set(0);
}

}
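The new gauges are also exposed through plain getters, so code that already holds the metrics instance can read them directly. A hedged usage sketch (the ReplicationManager handle, the logger, and the alerting threshold are illustrative assumptions; only the getters come from this commit):

// Sketch: read the new EC health gauges after a ReplicationManager pass.
static void logEcHealth(ReplicationManager replicationManager, org.slf4j.Logger log) {
  ReplicationManagerMetrics metrics = replicationManager.getMetrics();
  long under = metrics.getEcUnderReplicatedContainers();
  long critical = metrics.getEcCriticalUnderReplicatedContainers();
  long unhealthy = metrics.getEcUnhealthyReplicatedContainers();
  long missing = metrics.getEcMissingContainers();
  log.info("EC health: under={}, critical={}, unhealthy={}, missing={}",
      under, critical, unhealthy, missing);
  if (critical > 0 || missing > 0) {
    // Hypothetical alerting hook, shown only to illustrate intent.
    log.warn("EC containers at risk of data loss: critical={}, missing={}",
        critical, missing);
  }
}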
ECReplicationCheckHandler.java
@@ -25,6 +25,7 @@
import org.apache.hadoop.hdds.scm.container.replication.ContainerHealthResult;
import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaOp;
import org.apache.hadoop.hdds.scm.container.replication.ECContainerReplicaCount;
import org.apache.hadoop.hdds.scm.container.replication.ReplicationManagerMetrics;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@@ -43,6 +44,12 @@ public class ECReplicationCheckHandler extends AbstractCheck {
private static final Logger LOG =
LoggerFactory.getLogger(ECReplicationCheckHandler.class);

private final ReplicationManagerMetrics metrics;

public ECReplicationCheckHandler(ReplicationManagerMetrics replicationManagerMetrics) {
this.metrics = replicationManagerMetrics;
}

@Override
public boolean handle(ContainerCheckRequest request) {
if (request.getContainerInfo().getReplicationType() != EC) {
@@ -67,22 +74,29 @@ public boolean handle(ContainerCheckRequest request) {
if (underHealth.isMissing()) {
report.incrementAndSample(
ReplicationManagerReport.HealthState.MISSING, containerID);
metrics.incrEcMissingContainers();
} else {
// A container which is unrecoverable but not missing must have too
// many unhealthy replicas. Therefore it is UNHEALTHY rather than
// missing.
report.incrementAndSample(
ReplicationManagerReport.HealthState.UNHEALTHY, containerID);
metrics.incrEcUnhealthyReplicatedContainers();
}
// An EC container can be both unrecoverable and have offline replicas. In this case, we need
// to report both states as the decommission monitor needs to wait for an extra copy to be
// made of the offline replica before decommission can complete.
if (underHealth.hasUnreplicatedOfflineIndexes()) {
report.incrementAndSample(ReplicationManagerReport.HealthState.UNDER_REPLICATED, containerID);
metrics.incrEcUnderReplicatedContainers();
}
} else {
report.incrementAndSample(
ReplicationManagerReport.HealthState.UNDER_REPLICATED, containerID);
metrics.incrEcUnderReplicatedContainers();
if (isCritical(request)) {
metrics.incrEcCriticalUnderReplicatedContainers();
}
}
if (!underHealth.isReplicatedOkAfterPending() &&
(!underHealth.isUnrecoverable()
@@ -173,4 +187,16 @@ public ContainerHealthResult checkHealth(ContainerCheckRequest request) {
return new ContainerHealthResult.HealthyResult(container);
}

private boolean isCritical(ContainerCheckRequest request) {
ContainerInfo container = request.getContainerInfo();
Set<ContainerReplica> replicas = request.getContainerReplicas();
List<ContainerReplicaOp> replicaPendingOps = request.getPendingOps();
ECContainerReplicaCount replicaCount =
new ECContainerReplicaCount(container, replicas, replicaPendingOps,
request.getMaintenanceRedundancy());
ECReplicationConfig repConfig =
(ECReplicationConfig) container.getReplicationConfig();
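// "Critical" means the container is one lost replica away from being
// unrecoverable: the number of unavailable indexes already equals the parity
// count, so every remaining replica is needed for reconstruction.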
return replicaCount.unavailableIndexes(true).size() == repConfig.getParity();
}

}
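As a worked example of the critical check: for the ECReplicationConfig(3, 2) scheme used in the tests below, data = 3 and parity = 2, so a fully replicated container has 5 indexes. When exactly 2 indexes are unavailable, unavailableIndexes(true).size() == 2 == getParity() and only the 3 replicas needed for reconstruction remain, so the container is counted in both ecUnderReplicatedContainers and ecCriticalUnderReplicatedContainers, which is the combination asserted by the new checks in testUnderReplicatedDueToUnhealthyReplicas.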
TestECReplicationCheckHandler.java
@@ -33,8 +33,11 @@
import org.apache.hadoop.hdds.scm.container.replication.ContainerHealthResult.HealthState;

import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaOp;
import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager;
import org.apache.hadoop.hdds.scm.container.replication.ReplicationManagerMetrics;
import org.apache.hadoop.hdds.scm.container.replication.ReplicationQueue;
import org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

@@ -76,13 +79,21 @@ public class TestECReplicationCheckHandler {
private ContainerCheckRequest.Builder requestBuilder;
private ReplicationManagerReport report;
private PlacementPolicy placementPolicy;
private ReplicationManagerMetrics metrics;

@BeforeEach
public void setup() {
placementPolicy = mock(PlacementPolicy.class);
when(placementPolicy.validateContainerPlacement(anyList(), anyInt()))
.thenReturn(new ContainerPlacementStatusDefault(2, 2, 3));
healthCheck = new ECReplicationCheckHandler();
ReplicationManager replicationManager = mock(ReplicationManager.class);
ReplicationManager.ReplicationManagerConfiguration rmConf =
new ReplicationManager.ReplicationManagerConfiguration();
when(replicationManager.getConfig())
.thenReturn(rmConf);
metrics = ReplicationManagerMetrics.create(replicationManager);
when(replicationManager.getMetrics()).thenReturn(metrics);
healthCheck = new ECReplicationCheckHandler(metrics);
repConfig = new ECReplicationConfig(3, 2);
repQueue = new ReplicationQueue();
report = new ReplicationManagerReport();
@@ -93,6 +104,13 @@ public void setup() {
.setReport(report);
}

@AfterEach
void cleanup() {
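// Unregister the metrics source created in setup() so every test starts
// with a freshly registered ReplicationManagerMetrics instance.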
if (metrics != null) {
metrics.unRegister();
}
}

@Test
public void testHealthyContainerIsHealthy() {
ContainerInfo container = createContainerInfo(repConfig);
@@ -132,6 +150,8 @@ public void testUnderReplicatedContainerIsUnderReplicated() {
assertEquals(0, repQueue.overReplicatedQueueSize());
assertEquals(1, report.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));

assertEquals(1, metrics.getEcUnderReplicatedContainers());
}

@Test
@@ -161,6 +181,8 @@ public void testUnderReplicatedContainerFixedWithPending() {
// Still under replicated until the pending complete
assertEquals(1, report.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));

assertEquals(1, metrics.getEcUnderReplicatedContainers());
}

@Test
@@ -188,6 +210,8 @@ public void testUnderReplicatedDueToOutOfService() {
// Still under replicated until the pending complete
assertEquals(1, report.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));

assertEquals(1, metrics.getEcUnderReplicatedContainers());
}

@Test
@@ -220,6 +244,8 @@ public void testUnderReplicatedDueToOutOfServiceFixedWithPending() {
// Still under replicated until the pending complete
assertEquals(1, report.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));

assertEquals(1, metrics.getEcUnderReplicatedContainers());
}

@Test
@@ -249,6 +275,8 @@ public void testUnderReplicatedDueToOutOfServiceAndMissingReplica() {
assertEquals(0, repQueue.overReplicatedQueueSize());
assertEquals(1, report.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));

assertEquals(1, metrics.getEcUnderReplicatedContainers());
}

@Test
@@ -279,6 +307,8 @@ public void testUnderReplicatedAndUnrecoverable() {
ReplicationManagerReport.HealthState.MISSING));
assertEquals(0, report.getStat(
ReplicationManagerReport.HealthState.UNHEALTHY));

assertEquals(1, metrics.getEcMissingContainers());
}

@Test
@@ -312,6 +342,8 @@ public void testUnderReplicatedAndUnhealthy() {
ReplicationManagerReport.HealthState.MISSING));
assertEquals(1, report.getStat(
ReplicationManagerReport.HealthState.UNHEALTHY));

assertEquals(1, metrics.getEcUnhealthyReplicatedContainers());
}

@Test
@@ -352,6 +384,9 @@ private void testUnderReplicatedAndUnrecoverableWithOffline(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));
assertEquals(1, report.getStat(
ReplicationManagerReport.HealthState.MISSING));

assertEquals(1, metrics.getEcUnderReplicatedContainers());
assertEquals(1, metrics.getEcMissingContainers());
}

@Test
@@ -398,6 +433,9 @@ private void testUnderReplicatedAndUnrecoverableWithOfflinePending(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));
assertEquals(1, report.getStat(
ReplicationManagerReport.HealthState.MISSING));

assertEquals(1, metrics.getEcUnderReplicatedContainers());
assertEquals(1, metrics.getEcMissingContainers());
}

/**
@@ -438,6 +476,9 @@ public void testUnderReplicatedDueToUnhealthyReplicas() {
assertEquals(0, repQueue.overReplicatedQueueSize());
assertEquals(1, report.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));

assertEquals(1, metrics.getEcUnderReplicatedContainers());
assertEquals(1, metrics.getEcCriticalUnderReplicatedContainers());
}

/**
@@ -581,6 +622,8 @@ public void testOverAndUnderReplicated() {
ReplicationManagerReport.HealthState.UNDER_REPLICATED));
assertEquals(0, report.getStat(
ReplicationManagerReport.HealthState.OVER_REPLICATED));

assertEquals(1, metrics.getEcUnderReplicatedContainers());
}

@Test
@@ -614,6 +657,8 @@ public void testUnderAndMisReplicatedContainer() {
ReplicationManagerReport.HealthState.OVER_REPLICATED));
assertEquals(0, report.getStat(
ReplicationManagerReport.HealthState.MIS_REPLICATED));

assertEquals(1, metrics.getEcUnderReplicatedContainers());
}

@Test