Skip to content

Commit

Permalink
HDDS-8223. SCM delete block service should wait for safemode to ex…
Browse files Browse the repository at this point in the history
…it. (#4432)
  • Loading branch information
guohao-rosicky committed Mar 24, 2023
1 parent 1eda443 commit 23e0ce7
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,11 @@

import javax.management.ObjectName;
import java.io.IOException;
import java.time.Duration;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import org.apache.hadoop.hdds.client.BlockID;
Expand All @@ -33,7 +31,6 @@
import org.apache.hadoop.hdds.conf.ConfigurationSource;
import org.apache.hadoop.hdds.conf.StorageUnit;
import org.apache.hadoop.hdds.scm.ScmConfigKeys;
import org.apache.hadoop.hdds.scm.ScmConfig;
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
import org.apache.hadoop.hdds.scm.container.common.helpers.AllocatedBlock;
import org.apache.hadoop.hdds.scm.container.common.helpers.ExcludeList;
Expand All @@ -50,8 +47,6 @@

import static org.apache.hadoop.hdds.scm.exceptions.SCMException.ResultCodes.INVALID_BLOCK_SIZE;
import static org.apache.hadoop.hdds.scm.ha.SequenceIdGenerator.LOCAL_ID;
import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_SERVICE_TIMEOUT;
import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_SERVICE_TIMEOUT_DEFAULT;

import org.apache.ratis.protocol.exceptions.NotLeaderException;
import org.slf4j.Logger;
Expand Down Expand Up @@ -110,18 +105,12 @@ public BlockManagerImpl(final ConfigurationSource conf,
scm.getScmContext(),
scm.getSequenceIdGen(),
metrics);
Duration svcInterval = conf.getObject(
ScmConfig.class).getBlockDeletionInterval();
long serviceTimeout =
conf.getTimeDuration(
OZONE_BLOCK_DELETING_SERVICE_TIMEOUT,
OZONE_BLOCK_DELETING_SERVICE_TIMEOUT_DEFAULT,
TimeUnit.MILLISECONDS);

blockDeletingService =
new SCMBlockDeletingService(deletedBlockLog,
scm.getScmNodeManager(), scm.getEventQueue(), scm.getScmContext(),
scm.getSCMServiceManager(), svcInterval, serviceTimeout, conf,
metrics);
scm.getSCMServiceManager(), conf,
metrics, scm.getSystemClock());
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
package org.apache.hadoop.hdds.scm.block;

import java.io.IOException;
import java.time.Duration;
import java.time.Clock;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
Expand All @@ -29,6 +29,7 @@
import java.util.concurrent.locks.ReentrantLock;
import java.util.stream.Collectors;

import org.apache.hadoop.hdds.HddsConfigKeys;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.DeletedBlocksTransaction;
Expand Down Expand Up @@ -56,6 +57,9 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_SERVICE_TIMEOUT;
import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_SERVICE_TIMEOUT_DEFAULT;

/**
* A background service running in SCM to delete blocks. This service scans
* the block deletion log at a certain interval and caches block deletion commands
Expand Down Expand Up @@ -84,15 +88,29 @@ public class SCMBlockDeletingService extends BackgroundService
private final Lock serviceLock = new ReentrantLock();
private ServiceStatus serviceStatus = ServiceStatus.PAUSING;

private long safemodeExitMillis = 0;
private final long safemodeExitRunDelayMillis;
private final Clock clock;

@SuppressWarnings("parameternumber")
public SCMBlockDeletingService(DeletedBlockLog deletedBlockLog,
NodeManager nodeManager, EventPublisher eventPublisher,
SCMContext scmContext, SCMServiceManager serviceManager,
Duration interval, long serviceTimeout,
ConfigurationSource conf,
ScmBlockDeletingServiceMetrics metrics) {
super("SCMBlockDeletingService", interval.toMillis(), TimeUnit.MILLISECONDS,
BLOCK_DELETING_SERVICE_CORE_POOL_SIZE, serviceTimeout);
ScmBlockDeletingServiceMetrics metrics,
Clock clock) {
super("SCMBlockDeletingService",
conf.getObject(ScmConfig.class).getBlockDeletionInterval().toMillis(),
TimeUnit.MILLISECONDS, BLOCK_DELETING_SERVICE_CORE_POOL_SIZE,
conf.getTimeDuration(OZONE_BLOCK_DELETING_SERVICE_TIMEOUT,
OZONE_BLOCK_DELETING_SERVICE_TIMEOUT_DEFAULT,
TimeUnit.MILLISECONDS));

this.safemodeExitRunDelayMillis = conf.getTimeDuration(
HddsConfigKeys.HDDS_SCM_WAIT_TIME_AFTER_SAFE_MODE_EXIT,
HddsConfigKeys.HDDS_SCM_WAIT_TIME_AFTER_SAFE_MODE_EXIT_DEFAULT,
TimeUnit.MILLISECONDS);
this.clock = clock;
this.deletedBlockLog = deletedBlockLog;
this.nodeManager = nodeManager;
this.eventPublisher = eventPublisher;
Expand Down Expand Up @@ -211,7 +229,9 @@ public void setBlockDeleteTXNum(int numTXs) {
public void notifyStatusChanged() {
serviceLock.lock();
try {
if (scmContext.isLeaderReady()) {
if (scmContext.isLeaderReady() && !scmContext.isInSafeMode() &&
serviceStatus != ServiceStatus.RUNNING) {
safemodeExitMillis = clock.millis();
serviceStatus = ServiceStatus.RUNNING;
} else {
serviceStatus = ServiceStatus.PAUSING;
Expand All @@ -225,7 +245,15 @@ public void notifyStatusChanged() {
public boolean shouldRun() {
serviceLock.lock();
try {
return serviceStatus == ServiceStatus.RUNNING;
long alreadyWaitTimeInMillis = clock.millis() - safemodeExitMillis;
boolean run = serviceStatus == ServiceStatus.RUNNING &&
(alreadyWaitTimeInMillis >= safemodeExitRunDelayMillis);
LOG.debug(
"Check scm block delete run: {} serviceStatus: {} " +
"safemodeExitRunDelayMillis: {} alreadyWaitTimeInMillis: {}",
run, serviceStatus, safemodeExitRunDelayMillis,
alreadyWaitTimeInMillis);
return run;
} finally {
serviceLock.unlock();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,8 @@ public final class StorageContainerManager extends ServiceRuntimeInfoImpl
OZONE_ADMINISTRATORS
);

private Clock systemClock;

/**
* Creates a new StorageContainerManager. Configuration will be
* updated with information on the actual listening addresses used
Expand Down Expand Up @@ -597,7 +599,7 @@ private void initializeSystemManagers(OzoneConfiguration conf,
SCMConfigurator configurator) throws IOException {
// Use SystemClock when data is persisted
// and used again after system restarts.
Clock systemClock = Clock.system(ZoneOffset.UTC);
systemClock = Clock.system(ZoneOffset.UTC);

if (configurator.getNetworkTopology() != null) {
clusterMap = configurator.getNetworkTopology();
Expand Down Expand Up @@ -906,6 +908,10 @@ public void setScmCertificateClient(CertificateClient client) {
scmCertificateClient = client;
}

/**
 * Exposes the {@link Clock} held in this object's {@code systemClock} field
 * so that collaborators can share the same time source.
 *
 * @return the current value of the {@code systemClock} field.
 *         NOTE(review): the field is declared without an initializer, so
 *         this is presumably {@code null} until it has been assigned -
 *         confirm that callers only use it after initialization.
 */
public Clock getSystemClock() {
  return this.systemClock;
}

private ContainerTokenSecretManager createContainerTokenSecretManager(
OzoneConfiguration conf) throws IOException {

Expand Down

0 comments on commit 23e0ce7

Please sign in to comment.