From d8acbfe7da482d4cdf57958e56bb1f2c9297d250 Mon Sep 17 00:00:00 2001 From: tom lee Date: Tue, 25 Jan 2022 23:26:14 +0800 Subject: [PATCH 1/2] HDFS-16438. Avoid holding read locks for a long time when scanDatanodeStorage --- .../DatanodeAdminBackoffMonitor.java | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeAdminBackoffMonitor.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeAdminBackoffMonitor.java index eb65b3843a9c4..ff98354451bb1 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeAdminBackoffMonitor.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeAdminBackoffMonitor.java @@ -24,6 +24,7 @@ import org.apache.hadoop.hdfs.server.namenode.INodeId; import org.apache.hadoop.hdfs.util.LightWeightHashSet; import org.apache.hadoop.hdfs.util.LightWeightLinkedSet; +import org.apache.hadoop.util.Time; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.HashMap; @@ -92,6 +93,11 @@ public class DatanodeAdminBackoffMonitor extends DatanodeAdminMonitorBase */ private int pendingRepLimit; + /** + * The lock holding time threshold for {@link #scanDatanodeStorage}. + */ + private final long scanDatanodeStorageLockTimeMs = 300; + /** * The list of blocks which have been placed onto the replication queue * and are waiting to be sufficiently replicated. @@ -618,7 +624,7 @@ private int getYetToBeProcessedCount() { * * As this method does not schedule any blocks for reconstuction, this * scan can be performed under the namenode readlock, and the lock is - * dropped and reaquired for each storage on the DN. + * dropped and reaquired for a batch of blocks on each storage on the DN. * * @param dn - The datanode to process * @param initialScan - True is this is the first time scanning the node @@ -650,6 +656,7 @@ private void scanDatanodeStorage(DatanodeDescriptor dn, continue; } Iterator it = s.getBlockIterator(); + long beginTime = Time.monotonicNow(); while (it.hasNext()) { BlockInfo b = it.next(); if (!initialScan || dn.isEnteringMaintenance()) { @@ -665,6 +672,16 @@ private void scanDatanodeStorage(DatanodeDescriptor dn, blockList.put(b, null); } numBlocksChecked++; + if (Time.monotonicNow() - beginTime > scanDatanodeStorageLockTimeMs) { + namesystem.readUnlock(); + try { + Thread.sleep(1); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + namesystem.readLock(); + beginTime = Time.monotonicNow(); + } } } finally { namesystem.readUnlock(); From 95535acf43d116af5833f7ac7b7cf9ee78196f33 Mon Sep 17 00:00:00 2001 From: tom lee Date: Fri, 28 Jan 2022 11:05:13 +0800 Subject: [PATCH 2/2] moving the replicated check outside of the lock --- .../DatanodeAdminBackoffMonitor.java | 46 +++++++------------ 1 file changed, 16 insertions(+), 30 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeAdminBackoffMonitor.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeAdminBackoffMonitor.java index ff98354451bb1..caf88c89bf87c 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeAdminBackoffMonitor.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeAdminBackoffMonitor.java @@ -24,7 +24,6 @@ import org.apache.hadoop.hdfs.server.namenode.INodeId; import org.apache.hadoop.hdfs.util.LightWeightHashSet; import org.apache.hadoop.hdfs.util.LightWeightLinkedSet; -import org.apache.hadoop.util.Time; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.HashMap; @@ -93,11 +92,6 @@ public class DatanodeAdminBackoffMonitor extends DatanodeAdminMonitorBase */ private int pendingRepLimit; - /** - * The lock holding time threshold for {@link #scanDatanodeStorage}. - */ - private final long scanDatanodeStorageLockTimeMs = 300; - /** * The list of blocks which have been placed onto the replication queue * and are waiting to be sufficiently replicated. @@ -624,7 +618,7 @@ private int getYetToBeProcessedCount() { * * As this method does not schedule any blocks for reconstuction, this * scan can be performed under the namenode readlock, and the lock is - * dropped and reaquired for a batch of blocks on each storage on the DN. + * dropped and reaquired for each storage on the DN. * * @param dn - The datanode to process * @param initialScan - True is this is the first time scanning the node @@ -656,37 +650,29 @@ private void scanDatanodeStorage(DatanodeDescriptor dn, continue; } Iterator it = s.getBlockIterator(); - long beginTime = Time.monotonicNow(); while (it.hasNext()) { BlockInfo b = it.next(); - if (!initialScan || dn.isEnteringMaintenance()) { - // this is a rescan, so most blocks should be replicated now, - // or this node is going into maintenance. On a healthy - // cluster using racks or upgrade domain, a node should be - // able to go into maintenance without replicating many blocks - // so we will check them immediately. - if (!isBlockReplicatedOk(dn, b, false, null)) { - blockList.put(b, null); - } - } else { - blockList.put(b, null); - } + blockList.put(b, null); numBlocksChecked++; - if (Time.monotonicNow() - beginTime > scanDatanodeStorageLockTimeMs) { - namesystem.readUnlock(); - try { - Thread.sleep(1); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - } - namesystem.readLock(); - beginTime = Time.monotonicNow(); - } } } finally { namesystem.readUnlock(); } } + if (!initialScan || dn.isEnteringMaintenance()) { + // this is a rescan, so most blocks should be replicated now, + // or this node is going into maintenance. On a healthy + // cluster using racks or upgrade domain, a node should be + // able to go into maintenance without replicating many blocks + // so we will check them immediately. + Iterator iterator = blockList.keySet().iterator(); + while(iterator.hasNext()) { + BlockInfo b = iterator.next(); + if (isBlockReplicatedOk(dn, b, false, null)) { + iterator.remove(); + } + } + } } /**