From 33d0140193ab6b91adf73629ab627738774325fc Mon Sep 17 00:00:00 2001 From: huangzhaobo Date: Tue, 26 Dec 2023 19:12:49 +0800 Subject: [PATCH] HDFS-17304. Update fsck -blockId to display slownode status of blocks. --- .../hdfs/server/namenode/NamenodeFsck.java | 7 ++- .../org/apache/hadoop/hdfs/tools/DFSck.java | 2 + .../hadoop/hdfs/server/namenode/TestFsck.java | 56 +++++++++++++++++++ 3 files changed, 64 insertions(+), 1 deletion(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NamenodeFsck.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NamenodeFsck.java index bc6df0141c646..5f9946b15912b 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NamenodeFsck.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NamenodeFsck.java @@ -121,6 +121,7 @@ public class NamenodeFsck implements DataEncryptionKeyFactory { // return string marking fsck status public static final String CORRUPT_STATUS = "is CORRUPT"; public static final String HEALTHY_STATUS = "is HEALTHY"; + public static final String SLOW_STATUS = "is SLOW"; public static final String DECOMMISSIONING_STATUS = "is DECOMMISSIONING"; public static final String DECOMMISSIONED_STATUS = "is DECOMMISSIONED"; public static final String ENTERING_MAINTENANCE_STATUS = @@ -377,7 +378,11 @@ private void printDatanodeReplicaStatus(Block block, } else if (blockManager.isExcess(dn, blockManager.getStoredBlock(block))) { out.print(EXCESS_STATUS); } else { - out.print(HEALTHY_STATUS); + if (blockManager.getDatanodeManager().getAllSlowDataNodes().contains(dn)) { + out.print(SLOW_STATUS); + } else { + out.print(HEALTHY_STATUS); + } } out.print("\n"); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSck.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSck.java index 9f0288ebf3f34..5973caeebcad0 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSck.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSck.java @@ -398,6 +398,8 @@ else if (args[idx].equals("-replicaDetails")) { errCode = 0; } else if (lastLine.endsWith(NamenodeFsck.EXCESS_STATUS)) { errCode = 0; + } else if (lastLine.endsWith(NamenodeFsck.SLOW_STATUS)) { + errCode = 0; } else if (lastLine.endsWith(NamenodeFsck.DECOMMISSIONED_STATUS)) { errCode = 2; } else if (lastLine.endsWith(NamenodeFsck.DECOMMISSIONING_STATUS)) { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFsck.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFsck.java index 892c5ce020a4d..e9bea99bfc641 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFsck.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFsck.java @@ -18,6 +18,10 @@ package org.apache.hadoop.hdfs.server.namenode; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_MIN_OUTLIER_DETECTION_NODES_KEY; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_OUTLIERS_REPORT_INTERVAL_KEY; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_PEER_METRICS_MIN_OUTLIER_DETECTION_SAMPLES_KEY; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_PEER_STATS_ENABLED_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_CORRUPT_BLOCK_DELETE_IMMEDIATELY_ENABLED; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY; import static org.junit.Assert.assertEquals; @@ -58,6 +62,9 @@ import java.util.regex.Pattern; import java.util.function.Supplier; + +import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableMap; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.ChecksumException; import org.apache.hadoop.fs.FSDataOutputStream; @@ -107,6 +114,7 @@ import org.apache.hadoop.hdfs.server.namenode.NamenodeFsck.ErasureCodingResult; import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil; import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols; +import org.apache.hadoop.hdfs.server.protocol.OutlierMetrics; import org.apache.hadoop.hdfs.tools.DFSck; import org.apache.hadoop.hdfs.util.HostsFileWriter; import org.apache.hadoop.hdfs.util.StripedBlockUtil; @@ -1749,6 +1757,54 @@ public Boolean get() { } } + @Test + public void testBlockIdSlowNode() throws Exception { + conf = new Configuration(); + conf.setBoolean(DFS_DATANODE_PEER_STATS_ENABLED_KEY, true); + conf.set(DFS_DATANODE_OUTLIERS_REPORT_INTERVAL_KEY, "1000"); + conf.set(DFS_DATANODE_MIN_OUTLIER_DETECTION_NODES_KEY, "1"); + conf.set(DFS_DATANODE_PEER_METRICS_MIN_OUTLIER_DETECTION_SAMPLES_KEY, "1"); + try (MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build()) { + cluster.waitActive(); + final DistributedFileSystem fs = cluster.getFileSystem(); + + Path file = new Path("/testFile"); + long fileLength = 512; + DFSTestUtil.createFile(fs, file, fileLength, (short) 3, 0L); + DFSTestUtil.waitReplication(fs, file, (short) 3); + + List dataNodes = cluster.getDataNodes(); + assertEquals(3, dataNodes.size()); + dataNodes.get(0).getPeerMetrics().setTestOutliers(ImmutableMap.of( + dataNodes.get(1).getDatanodeHostname() + ":" + dataNodes.get(1).getIpcPort(), + new OutlierMetrics(1.0, 2.0, 3.0, 4.0))); + dataNodes.get(1).getPeerMetrics().setTestOutliers(ImmutableMap.of( + dataNodes.get(2).getDatanodeHostname() + ":" + dataNodes.get(2).getIpcPort(), + new OutlierMetrics(1.0, 2.0, 3.0, 4.0))); + dataNodes.get(2).getPeerMetrics().setTestOutliers(ImmutableMap.of( + dataNodes.get(0).getDatanodeHostname() + ":" + dataNodes.get(0).getIpcPort(), + new OutlierMetrics(1.0, 2.0, 3.0, 4.0))); + + GenericTestUtils.waitFor(() -> { + try { + DatanodeInfo[] slowNodeInfo = fs.getSlowDatanodeStats(); + LOG.info("Slow Datanode report: {}", Arrays.asList(slowNodeInfo)); + return slowNodeInfo.length == 3; + } catch (IOException e) { + LOG.error("Failed to retrieve SlowNode report", e); + return false; + } + }, 1000, 60000, "Slow nodes could not be detected"); + + List locatedBlocks = DFSTestUtil.getAllBlocks(fs, file); + assertEquals(1, locatedBlocks.size()); + String blockName = locatedBlocks.get(0).getBlock().getBlockName(); + + String outStr = runFsck(conf, 0, true, "/", "-blockId", blockName); + assertTrue(outStr.contains(NamenodeFsck.SLOW_STATUS)); + } + } + /** * Test for blockIdCK with block corruption. */