From f9a0e2338150f1bd3ba2c29f76979183fd3ed80c Mon Sep 17 00:00:00 2001 From: Sunil G Date: Thu, 17 Aug 2017 15:07:15 +0530 Subject: [PATCH] YARN-3254. HealthReport should include disk full information. Contributed by Suma Shivaprasad. --- .../nodemanager/DirectoryCollection.java | 61 ++++++++++++++++++- .../nodemanager/LocalDirsHandlerService.java | 59 ++++++++++++++---- .../nodemanager/TestDirectoryCollection.java | 23 +++++++ 3 files changed, 130 insertions(+), 13 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java index ae2a4ef1ca478..502485f90918c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java @@ -38,6 +38,7 @@ import org.apache.commons.lang.RandomStringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.fs.FileAlreadyExistsException; import org.apache.hadoop.fs.FileContext; import org.apache.hadoop.fs.Path; @@ -99,6 +100,7 @@ static List concat(List l1, List l2) { private List localDirs; private List errorDirs; private List fullDirs; + private Map directoryErrorInfo; // read/write lock for accessing above directories. 
private final ReadLock readLock; @@ -192,6 +194,7 @@ public DirectoryCollection(String[] dirs, localDirs = new CopyOnWriteArrayList<>(dirs); errorDirs = new CopyOnWriteArrayList<>(); fullDirs = new CopyOnWriteArrayList<>(); + directoryErrorInfo = new ConcurrentHashMap<>(); ReentrantReadWriteLock lock = new ReentrantReadWriteLock(); this.readLock = lock.readLock(); @@ -248,11 +251,25 @@ List getFailedDirs() { /** * @return the directories that have used all disk space */ - List getFullDirs() { this.readLock.lock(); try { - return fullDirs; + return Collections.unmodifiableList(fullDirs); + } finally { + this.readLock.unlock(); + } + } + + /** + * @return the directories that have errors - they may not have appropriate permissions + * or other disk validation checks might have failed in {@link DiskValidator} + * + */ + @InterfaceStability.Evolving + List getErroredDirs() { + this.readLock.lock(); + try { + return Collections.unmodifiableList(errorDirs); } finally { this.readLock.unlock(); } @@ -270,6 +287,39 @@ int getNumFailures() { } } + /** + * + * @param dirName Absolute path of Directory for which error diagnostics are needed + * @return DiskErrorInformation - disk error diagnostics recorded for the specified directory, + * or null when no error is recorded for it, i.e. it has passed disk utilization checks + * /error validations in {@link DiskValidator} + * + */ + @InterfaceStability.Evolving + DiskErrorInformation getDirectoryErrorInfo(String dirName) { + this.readLock.lock(); + try { + return directoryErrorInfo.get(dirName); + } finally { + this.readLock.unlock(); + } + } + + /** + * + * @param dirName Absolute path of the directory whose disk health is being queried + * @return true if error information is currently recorded for the directory, false otherwise + */ + @InterfaceStability.Evolving + boolean isDiskUnHealthy(String dirName) { + this.readLock.lock(); + try { + return directoryErrorInfo.containsKey(dirName); + } finally { + this.readLock.unlock(); + } + } + /** * Create any 
non-existent directories and parent directories, updating the * list of valid directories if necessary. @@ -297,6 +347,9 @@ boolean createNonExistentDirs(FileContext localFs, try { localDirs.remove(dir); errorDirs.add(dir); + directoryErrorInfo.put(dir, + new DiskErrorInformation(DiskErrorCause.OTHER, + "Cannot create directory : " + dir + ", error " + e.getMessage())); numFailures++; } finally { this.writeLock.unlock(); @@ -343,11 +396,13 @@ boolean checkDirs() { localDirs.clear(); errorDirs.clear(); fullDirs.clear(); + directoryErrorInfo.clear(); for (Map.Entry entry : dirsFailedCheck .entrySet()) { String dir = entry.getKey(); DiskErrorInformation errorInformation = entry.getValue(); + switch (entry.getValue().cause) { case DISK_FULL: fullDirs.add(entry.getKey()); @@ -359,6 +414,8 @@ boolean checkDirs() { LOG.warn(entry.getValue().cause + " is unknown for disk error."); break; } + directoryErrorInfo.put(entry.getKey(), errorInformation); + if (preCheckGoodDirs.contains(dir)) { LOG.warn("Directory " + dir + " error, " + errorInformation.message + ", removing from list of valid directories"); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java index f8cb4eee709f5..6e00808ae88db 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java @@ -53,6 +53,8 @@ public class LocalDirsHandlerService extends AbstractService { private static Log LOG = 
LogFactory.getLog(LocalDirsHandlerService.class); + private static final String diskCapacityExceededErrorMsg = "usable space is below configured utilization percentage/no more usable space"; + /** * Good local directories, use internally, * initial value is the same as NM_LOCAL_DIRS. @@ -344,21 +346,36 @@ public String getDisksHealthReport(boolean listGoodDirs) { } StringBuilder report = new StringBuilder(); - List failedLocalDirsList = localDirs.getFailedDirs(); - List failedLogDirsList = logDirs.getFailedDirs(); + List erroredLocalDirsList = localDirs.getErroredDirs(); + List erroredLogDirsList = logDirs.getErroredDirs(); + List diskFullLocalDirsList = localDirs.getFullDirs(); + List diskFullLogDirsList = logDirs.getFullDirs(); List goodLocalDirsList = localDirs.getGoodDirs(); List goodLogDirsList = logDirs.getGoodDirs(); - int numLocalDirs = goodLocalDirsList.size() + failedLocalDirsList.size(); - int numLogDirs = goodLogDirsList.size() + failedLogDirsList.size(); + + int numLocalDirs = goodLocalDirsList.size() + erroredLocalDirsList.size() + diskFullLocalDirsList.size(); + int numLogDirs = goodLogDirsList.size() + erroredLogDirsList.size() + diskFullLogDirsList.size(); if (!listGoodDirs) { - if (!failedLocalDirsList.isEmpty()) { - report.append(failedLocalDirsList.size() + "/" + numLocalDirs - + " local-dirs are bad: " - + StringUtils.join(",", failedLocalDirsList) + "; "); + if (!erroredLocalDirsList.isEmpty()) { + report.append(erroredLocalDirsList.size() + "/" + numLocalDirs + + " local-dirs have errors: " + + buildDiskErrorReport(erroredLocalDirsList, localDirs)); + } + if (!diskFullLocalDirsList.isEmpty()) { + report.append(diskFullLocalDirsList.size() + "/" + numLocalDirs + + " local-dirs " + diskCapacityExceededErrorMsg + + buildDiskErrorReport(diskFullLocalDirsList, localDirs) + "; "); } - if (!failedLogDirsList.isEmpty()) { - report.append(failedLogDirsList.size() + "/" + numLogDirs - + " log-dirs are bad: " + StringUtils.join(",", failedLogDirsList)); 
+ + if (!erroredLogDirsList.isEmpty()) { + report.append(erroredLogDirsList.size() + "/" + numLogDirs + + " log-dirs have errors: " + + buildDiskErrorReport(erroredLogDirsList, logDirs)); + } + if (!diskFullLogDirsList.isEmpty()) { + report.append(diskFullLogDirsList.size() + "/" + numLogDirs + + " log-dirs " + diskCapacityExceededErrorMsg + + buildDiskErrorReport(diskFullLogDirsList, logDirs)); } } else { report.append(goodLocalDirsList.size() + "/" + numLocalDirs @@ -620,4 +637,24 @@ protected void updateMetrics() { logDirs.getGoodDirsDiskUtilizationPercentage()); } } + + /** + * Formats the given directories as " [ dir : message , ... ] " using the error recorded for each. + * Reports "Unknown cause for disk error" when directoryCollection has no entry for a directory. + */ + private String buildDiskErrorReport(List dirs, DirectoryCollection directoryCollection) { + StringBuilder sb = new StringBuilder(); + sb.append(" [ "); + String separator = ""; + /* Iterate rather than index: checkDirs() may clear the backing list between size() and get(i). */ + for (String dirName : dirs) { + /* One lookup, then a null check: an entry seen by isDiskUnHealthy() may be gone by the fetch. */ + DirectoryCollection.DiskErrorInformation errorInfo = directoryCollection.getDirectoryErrorInfo(dirName); + sb.append(separator).append(dirName).append(" : "); + sb.append(errorInfo != null ? errorInfo.message : "Unknown cause for disk error"); + separator = " , "; + } + sb.append(" ] "); + return sb.toString(); + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java index e529628b71011..095f21a4f4b76 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java @@ -128,8 +128,12 @@ public void testDiskSpaceUtilizationLimit() throws IOException { DirectoryCollection dc = 
new DirectoryCollection(dirs, 0.0F); dc.checkDirs(); Assert.assertEquals(0, dc.getGoodDirs().size()); + Assert.assertEquals(0, dc.getErroredDirs().size()); Assert.assertEquals(1, dc.getFailedDirs().size()); Assert.assertEquals(1, dc.getFullDirs().size()); + Assert.assertNotNull(dc.getDirectoryErrorInfo(dirA)); + Assert.assertEquals(DirectoryCollection.DiskErrorCause.DISK_FULL, dc.getDirectoryErrorInfo(dirA).cause); + // no good dirs Assert.assertEquals(0, dc.getGoodDirsDiskUtilizationPercentage()); @@ -139,16 +143,21 @@ public void testDiskSpaceUtilizationLimit() throws IOException { testDir.getTotalSpace()); dc.checkDirs(); Assert.assertEquals(1, dc.getGoodDirs().size()); + Assert.assertEquals(0, dc.getErroredDirs().size()); Assert.assertEquals(0, dc.getFailedDirs().size()); Assert.assertEquals(0, dc.getFullDirs().size()); + Assert.assertNull(dc.getDirectoryErrorInfo(dirA)); + Assert.assertEquals(utilizedSpacePerc, dc.getGoodDirsDiskUtilizationPercentage()); dc = new DirectoryCollection(dirs, testDir.getTotalSpace() / (1024 * 1024)); dc.checkDirs(); Assert.assertEquals(0, dc.getGoodDirs().size()); + Assert.assertEquals(0, dc.getErroredDirs().size()); Assert.assertEquals(1, dc.getFailedDirs().size()); Assert.assertEquals(1, dc.getFullDirs().size()); + Assert.assertNotNull(dc.getDirectoryErrorInfo(dirA)); // no good dirs Assert.assertEquals(0, dc.getGoodDirsDiskUtilizationPercentage()); @@ -158,8 +167,11 @@ public void testDiskSpaceUtilizationLimit() throws IOException { testDir.getTotalSpace()); dc.checkDirs(); Assert.assertEquals(1, dc.getGoodDirs().size()); + Assert.assertEquals(0, dc.getErroredDirs().size()); Assert.assertEquals(0, dc.getFailedDirs().size()); Assert.assertEquals(0, dc.getFullDirs().size()); + Assert.assertNull(dc.getDirectoryErrorInfo(dirA)); + Assert.assertEquals(utilizedSpacePerc, dc.getGoodDirsDiskUtilizationPercentage()); } @@ -209,12 +221,17 @@ public void testFailedDisksBecomingGoodAgain() throws Exception { Assert.assertEquals(0, 
dc.getGoodDirs().size()); Assert.assertEquals(1, dc.getFailedDirs().size()); Assert.assertEquals(1, dc.getFullDirs().size()); + Assert.assertEquals(0, dc.getErroredDirs().size()); + Assert.assertNotNull(dc.getDirectoryErrorInfo(dirA)); + Assert.assertEquals(DirectoryCollection.DiskErrorCause.DISK_FULL, dc.getDirectoryErrorInfo(dirA).cause); dc.setDiskUtilizationPercentageCutoff(100.0F, 100.0F); dc.checkDirs(); Assert.assertEquals(1, dc.getGoodDirs().size()); Assert.assertEquals(0, dc.getFailedDirs().size()); Assert.assertEquals(0, dc.getFullDirs().size()); + Assert.assertEquals(0, dc.getErroredDirs().size()); + Assert.assertNull(dc.getDirectoryErrorInfo(dirA)); conf.set(CommonConfigurationKeys.FS_PERMISSIONS_UMASK_KEY, "077"); @@ -232,12 +249,18 @@ public void testFailedDisksBecomingGoodAgain() throws Exception { Assert.assertEquals(0, dc.getGoodDirs().size()); Assert.assertEquals(1, dc.getFailedDirs().size()); Assert.assertEquals(0, dc.getFullDirs().size()); + Assert.assertEquals(1, dc.getErroredDirs().size()); + Assert.assertNotNull(dc.getDirectoryErrorInfo(dirB)); + Assert.assertEquals(DirectoryCollection.DiskErrorCause.OTHER, dc.getDirectoryErrorInfo(dirB).cause); + permDirB = new FsPermission((short) 0700); localFs.setPermission(pathB, permDirB); dc.checkDirs(); Assert.assertEquals(1, dc.getGoodDirs().size()); Assert.assertEquals(0, dc.getFailedDirs().size()); Assert.assertEquals(0, dc.getFullDirs().size()); + Assert.assertEquals(0, dc.getErroredDirs().size()); + Assert.assertNull(dc.getDirectoryErrorInfo(dirA)); } @Test