Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -17,25 +17,49 @@

package org.apache.hadoop.ozone.recon.metrics;

import com.google.common.base.CaseFormat;
import java.util.Collections;
import java.util.EnumMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.hadoop.hdds.annotation.InterfaceAudience;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
import org.apache.hadoop.metrics2.MetricsCollector;
import org.apache.hadoop.metrics2.MetricsInfo;
import org.apache.hadoop.metrics2.MetricsRecordBuilder;
import org.apache.hadoop.metrics2.MetricsSource;
import org.apache.hadoop.metrics2.MetricsSystem;
import org.apache.hadoop.metrics2.annotation.Metric;
import org.apache.hadoop.metrics2.annotation.Metrics;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.metrics2.lib.MutableGaugeInt;
import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
import org.apache.hadoop.metrics2.lib.Interns;
import org.apache.hadoop.ozone.OzoneConsts;

/**
* Metrics for Recon SCM targeted sync execution.
*/
@InterfaceAudience.Private
@Metrics(about = "Recon SCM Container Sync Metrics", context = OzoneConsts.OZONE)
public final class ReconScmContainerSyncMetrics {
public final class ReconScmContainerSyncMetrics implements MetricsSource {

private static final String SOURCE_NAME =
ReconScmContainerSyncMetrics.class.getSimpleName();

private static final HddsProtos.LifeCycleState[] SYNC_STATES = {
HddsProtos.LifeCycleState.OPEN,
HddsProtos.LifeCycleState.QUASI_CLOSED,
HddsProtos.LifeCycleState.CLOSED,
HddsProtos.LifeCycleState.DELETED
};

private static final MetricsInfo SCM_CONTAINER_SYNC_STATUS = Interns.info(
"scmContainerSyncStatus",
"SCM container sync status: 0=idle, 1=in progress, 2=success, 3=failure");

private static final MetricsInfo LAST_TARGETED_SYNC_DURATION_MS = Interns.info(
"lastTargetedSyncDurationMs",
"Time taken by the last targeted sync in milliseconds");

/**
* No targeted sync has run yet, or the latest scheduler cycle did not run one.
*/
Expand All @@ -53,14 +77,22 @@ public final class ReconScmContainerSyncMetrics {
*/
public static final int TARGETED_SYNC_STATUS_FAILURE = 3;

@Metric(about = "Targeted sync status: 0=idle, 1=in progress, "
+ "2=success, 3=failure")
private MutableGaugeInt targetedSyncStatus;

@Metric(about = "Time taken by the last targeted sync in milliseconds")
private MutableGaugeLong lastTargetedSyncDurationMs;
private final AtomicInteger scmContainerSyncStatus = new AtomicInteger();
private final AtomicLong lastTargetedSyncDurationMs = new AtomicLong();
private final Map<HddsProtos.LifeCycleState, AtomicLong>
containerSyncDurationMs;
private final Map<HddsProtos.LifeCycleState, AtomicLong>
containerCountDrift;
private final Map<HddsProtos.LifeCycleState, MetricsInfo>
containerSyncDurationMetricInfo;
private final Map<HddsProtos.LifeCycleState, MetricsInfo>
containerCountDriftMetricInfo;

private ReconScmContainerSyncMetrics() {
containerSyncDurationMs = initStateGaugeValues();
containerCountDrift = initStateGaugeValues();
containerSyncDurationMetricInfo = initSyncDurationMetricInfo();
containerCountDriftMetricInfo = initCountDriftMetricInfo();
}

public static ReconScmContainerSyncMetrics create() {
Expand All @@ -75,19 +107,115 @@ public void unRegister() {
ms.unregisterSource(SOURCE_NAME);
}

public void setTargetedSyncStatus(int status) {
targetedSyncStatus.set(status);
public void setScmContainerSyncStatus(int status) {
scmContainerSyncStatus.set(status);
}

public void setLastTargetedSyncDurationMs(long durationMs) {
lastTargetedSyncDurationMs.set(durationMs);
}

public int getTargetedSyncStatus() {
return targetedSyncStatus.value();
public void setContainerSyncDurationMs(
HddsProtos.LifeCycleState state, long durationMs) {
setStateGauge(containerSyncDurationMs, state, durationMs);
}

public void setContainerCountDrift(
HddsProtos.LifeCycleState state, long drift) {
setStateGauge(containerCountDrift, state, drift);
}

public int getScmContainerSyncStatus() {
return scmContainerSyncStatus.get();
}

public long getLastTargetedSyncDurationMs() {
return lastTargetedSyncDurationMs.value();
return lastTargetedSyncDurationMs.get();
}

public long getContainerSyncDurationMs(
HddsProtos.LifeCycleState state) {
return getStateGauge(containerSyncDurationMs, state);
}

public long getContainerCountDrift(
HddsProtos.LifeCycleState state) {
return getStateGauge(containerCountDrift, state);
}

@Override
public void getMetrics(MetricsCollector collector, boolean all) {
MetricsRecordBuilder builder = collector.addRecord(SOURCE_NAME);
builder.addGauge(SCM_CONTAINER_SYNC_STATUS, getScmContainerSyncStatus());
builder.addGauge(LAST_TARGETED_SYNC_DURATION_MS,
getLastTargetedSyncDurationMs());
for (HddsProtos.LifeCycleState state : SYNC_STATES) {
builder.addGauge(containerSyncDurationMetricInfo.get(state),
getContainerSyncDurationMs(state));
builder.addGauge(containerCountDriftMetricInfo.get(state),
getContainerCountDrift(state));
}
}

private static Map<HddsProtos.LifeCycleState, AtomicLong>
initStateGaugeValues() {
Map<HddsProtos.LifeCycleState, AtomicLong> gauges =
new EnumMap<>(HddsProtos.LifeCycleState.class);
for (HddsProtos.LifeCycleState state : SYNC_STATES) {
gauges.put(state, new AtomicLong());
}
return Collections.unmodifiableMap(gauges);
}

private static Map<HddsProtos.LifeCycleState, MetricsInfo>
initSyncDurationMetricInfo() {
Map<HddsProtos.LifeCycleState, MetricsInfo> metrics =
new EnumMap<>(HddsProtos.LifeCycleState.class);
for (HddsProtos.LifeCycleState state : SYNC_STATES) {
String stateName = metricStateName(state);
metrics.put(state, Interns.info(
CaseFormat.UPPER_CAMEL.to(CaseFormat.LOWER_CAMEL, stateName)
+ "ContainerSyncDurationMs",
"Time taken by the " + stateName
+ " container sync pass in milliseconds"));
}
return Collections.unmodifiableMap(metrics);
}

private static Map<HddsProtos.LifeCycleState, MetricsInfo>
initCountDriftMetricInfo() {
Map<HddsProtos.LifeCycleState, MetricsInfo> metrics =
new EnumMap<>(HddsProtos.LifeCycleState.class);
for (HddsProtos.LifeCycleState state : SYNC_STATES) {
String stateName = metricStateName(state);
metrics.put(state, Interns.info(
CaseFormat.UPPER_CAMEL.to(CaseFormat.LOWER_CAMEL, stateName)
+ "ContainerCountDrift",
"Pre-sync observed " + stateName
+ " container count drift, computed as SCM count minus Recon count"));
}
return Collections.unmodifiableMap(metrics);
}

private static String metricStateName(HddsProtos.LifeCycleState state) {
return CaseFormat.UPPER_UNDERSCORE.to(
CaseFormat.UPPER_CAMEL, state.name());
}

private static void setStateGauge(
Map<HddsProtos.LifeCycleState, AtomicLong> gauges,
HddsProtos.LifeCycleState state,
long value) {
AtomicLong gauge = gauges.get(state);
if (gauge != null) {
gauge.set(value);
}
}

private static long getStateGauge(
Map<HddsProtos.LifeCycleState, AtomicLong> gauges,
HddsProtos.LifeCycleState state) {
AtomicLong gauge = gauges.get(state);
return gauge != null ? gauge.get() : 0L;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -548,7 +548,8 @@ public ReconStorageContainerManagerFacade(OzoneConfiguration conf,
containerSyncHelper = new ReconStorageContainerSyncHelper(
scmServiceProvider,
ozoneConfiguration,
containerManager
containerManager,
containerSyncMetrics
);
}

Expand Down Expand Up @@ -911,16 +912,16 @@ public boolean syncWithSCMContainerInfo() {

private boolean runTargetedSyncWithMetrics() {
long startTime = Time.monotonicNow();
containerSyncMetrics.setTargetedSyncStatus(
containerSyncMetrics.setScmContainerSyncStatus(
ReconScmContainerSyncMetrics.TARGETED_SYNC_STATUS_IN_PROGRESS);
try {
boolean success = containerSyncHelper.syncWithSCMContainerInfo();
containerSyncMetrics.setTargetedSyncStatus(success
containerSyncMetrics.setScmContainerSyncStatus(success
? ReconScmContainerSyncMetrics.TARGETED_SYNC_STATUS_SUCCESS
: ReconScmContainerSyncMetrics.TARGETED_SYNC_STATUS_FAILURE);
return success;
} catch (RuntimeException | Error e) {
containerSyncMetrics.setTargetedSyncStatus(
containerSyncMetrics.setScmContainerSyncStatus(
ReconScmContainerSyncMetrics.TARGETED_SYNC_STATUS_FAILURE);
throw e;
} finally {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@
import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException;
import org.apache.hadoop.hdds.scm.container.common.helpers.ContainerWithPipeline;
import org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionException;
import org.apache.hadoop.ozone.recon.metrics.ReconScmContainerSyncMetrics;
import org.apache.hadoop.ozone.recon.spi.StorageContainerServiceProvider;
import org.apache.hadoop.util.Time;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -140,13 +142,22 @@ class ReconStorageContainerSyncHelper {
private final StorageContainerServiceProvider scmServiceProvider;
private final OzoneConfiguration ozoneConfiguration;
private final ReconContainerManager containerManager;
private final ReconScmContainerSyncMetrics containerSyncMetrics;

ReconStorageContainerSyncHelper(StorageContainerServiceProvider scmServiceProvider,
OzoneConfiguration ozoneConfiguration,
ReconContainerManager containerManager) {
this(scmServiceProvider, ozoneConfiguration, containerManager, null);
}

ReconStorageContainerSyncHelper(StorageContainerServiceProvider scmServiceProvider,
OzoneConfiguration ozoneConfiguration,
ReconContainerManager containerManager,
ReconScmContainerSyncMetrics containerSyncMetrics) {
this.scmServiceProvider = scmServiceProvider;
this.ozoneConfiguration = ozoneConfiguration;
this.containerManager = containerManager;
this.containerSyncMetrics = containerSyncMetrics;
}

/**
Expand All @@ -167,8 +178,10 @@ public boolean syncWithSCMContainerInfo() {
*/
private boolean syncContainersForState(HddsProtos.LifeCycleState scmState,
boolean incrementalOpen) {
long startTime = Time.monotonicNow();
try {
long total = scmServiceProvider.getContainerCount(scmState);
updateContainerCountDrift(scmState, total);
if (total == 0) {
LOG.debug("{} sync: no containers found in SCM.", scmState);
return true;
Expand Down Expand Up @@ -222,6 +235,8 @@ private boolean syncContainersForState(HddsProtos.LifeCycleState scmState,
} catch (Exception e) {
LOG.error("{} sync: unexpected error.", scmState, e);
return false;
} finally {
updateContainerSyncDuration(scmState, Time.monotonicNow() - startTime);
}
}

Expand Down Expand Up @@ -359,7 +374,9 @@ private int rebuildContainerFromScm(ContainerID containerID,
* @return {@code true} if all RPC calls completed without error
*/
private boolean syncDeletedContainers() {
long startTime = Time.monotonicNow();
try {
updateDeletedContainerCountDrift();
int configuredBatch = ozoneConfiguration.getInt(
OZONE_RECON_SCM_DELETED_CONTAINER_CHECK_BATCH_SIZE,
OZONE_RECON_SCM_DELETED_CONTAINER_CHECK_BATCH_SIZE_DEFAULT);
Expand Down Expand Up @@ -390,6 +407,39 @@ private boolean syncDeletedContainers() {
} catch (Exception e) {
LOG.error("DELETED sync: unexpected error.", e);
return false;
} finally {
updateContainerSyncDuration(HddsProtos.LifeCycleState.DELETED,
Time.monotonicNow() - startTime);
}
}

private void updateDeletedContainerCountDrift() {
if (containerSyncMetrics == null) {
return;
}
try {
long total = scmServiceProvider.getContainerCount(
HddsProtos.LifeCycleState.DELETED);
updateContainerCountDrift(HddsProtos.LifeCycleState.DELETED, total);
} catch (Exception e) {
LOG.warn("DELETED sync: unable to update pre-sync count drift metric.", e);
}
}

private void updateContainerCountDrift(HddsProtos.LifeCycleState state,
long scmCount) {
if (containerSyncMetrics == null) {
return;
}
long reconCount = containerManager.getContainerStateCount(state);
containerSyncMetrics.setContainerCountDrift(state,
scmCount - reconCount);
}

private void updateContainerSyncDuration(HddsProtos.LifeCycleState state,
long durationMs) {
if (containerSyncMetrics != null) {
containerSyncMetrics.setContainerSyncDurationMs(state, durationMs);
}
}

Expand Down
Loading