From 54d59802483d9abbe5923541eecd8a92eaab0023 Mon Sep 17 00:00:00 2001 From: garyhwang Date: Tue, 17 Jan 2023 16:27:15 +0800 Subject: [PATCH 1/4] Add a new metric that reflects the number of applications whose FinalStatus is FAILED --- .../server/resourcemanager/rmapp/RMAppImpl.java | 2 +- .../resourcemanager/scheduler/QueueMetrics.java | 16 +++++++++++++--- .../scheduler/SchedulerApplication.java | 5 +++-- .../scheduler/capacity/AbstractLeafQueue.java | 4 ++-- .../scheduler/capacity/CSQueueMetrics.java | 6 +++--- .../scheduler/capacity/CapacityScheduler.java | 7 ++++--- .../event/AppRemovedSchedulerEvent.java | 9 ++++++++- .../scheduler/fair/FairScheduler.java | 7 ++++--- .../scheduler/fifo/FifoScheduler.java | 7 ++++--- .../webapp/MetricsOverviewTable.java | 4 ++++ .../webapp/dao/ClusterMetricsInfo.java | 10 ++++++++++ .../webapp/dao/UserMetricsInfo.java | 6 ++++++ .../scheduler/AppMetricsChecker.java | 3 +++ .../scheduler/TestPartitionQueueMetrics.java | 7 ++++--- .../scheduler/TestQueueMetrics.java | 15 +++++++++------ .../scheduler/TestSchedulerUtils.java | 3 ++- .../TestAutoCreatedQueueDeletionPolicy.java | 3 ++- ...estCapacitySchedulerNewQueueAutoCreation.java | 5 +++-- .../scheduler/capacity/TestLeafQueue.java | 5 +++-- .../router/webapp/RouterWebServiceUtil.java | 2 ++ .../MockDefaultRequestInterceptorREST.java | 1 + .../TestFederationInterceptorRESTRetry.java | 3 +++ .../router/webapp/TestRouterWebServiceUtil.java | 5 +++++ 23 files changed, 99 insertions(+), 36 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java index ca88b8be3281c..7834873174644 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java @@ -1522,7 +1522,7 @@ private void completeAndCleanupApp(RMAppImpl app) { // need to remove them from scheduler. if (app.recoveredFinalState == null) { app.handler.handle(new AppRemovedSchedulerEvent(app.applicationId, - finalState)); + finalState, app.getFinalApplicationStatus())); } app.handler.handle(new RMAppManagerEvent(app.applicationId, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/QueueMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/QueueMetrics.java index 0bfee4d33500e..854d92ccb34b6 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/QueueMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/QueueMetrics.java @@ -43,6 +43,7 @@ import org.apache.hadoop.metrics2.lib.MutableRate; import org.apache.hadoop.util.Sets; import org.apache.hadoop.yarn.api.records.ApplicationId; +import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.metrics.CustomResourceMetricValue; @@ -64,6 +65,7 @@ public class QueueMetrics implements MetricsSource { @Metric("# of apps completed") MutableCounterInt appsCompleted; @Metric("# of apps killed") MutableCounterInt appsKilled; @Metric("# of apps failed") MutableCounterInt appsFailed; + @Metric("# of apps finally failed") MutableCounterInt appsFinalFailed; @Metric("# of Unmanaged apps submitted") private MutableCounterInt unmanagedAppsSubmitted; @@ -498,7 +500,7 @@ public void finishAppAttempt(ApplicationId appId, boolean isPending, } } - public void finishApp(String user, RMAppState rmAppFinalState, + public void finishApp(String user, RMAppState rmAppFinalState, FinalApplicationStatus finalApplicationStatus, boolean unmanagedAM) { switch (rmAppFinalState) { case KILLED: appsKilled.incr(); break; @@ -506,6 +508,10 @@ public void finishApp(String user, RMAppState rmAppFinalState, default: appsCompleted.incr(); break; } + if (finalApplicationStatus == FinalApplicationStatus.FAILED) { + appsFinalFailed.incr(); + } + if(unmanagedAM) { switch (rmAppFinalState) { case KILLED: @@ -522,10 +528,10 @@ public void finishApp(String user, RMAppState rmAppFinalState, QueueMetrics userMetrics = getUserMetrics(user); if (userMetrics != null) { - userMetrics.finishApp(user, rmAppFinalState, unmanagedAM); + userMetrics.finishApp(user, rmAppFinalState, finalApplicationStatus, unmanagedAM); } if (parent != null) { - parent.finishApp(user, rmAppFinalState, unmanagedAM); + parent.finishApp(user, rmAppFinalState, finalApplicationStatus, unmanagedAM); } } @@ -1138,6 +1144,10 @@ public int getAppsFailed() { return appsFailed.value(); } + public int getAppsFinalFailed() { + return appsFinalFailed.value(); + } + public int getUnmanagedAppsFailed() { return unmanagedAppsFailed.value(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplication.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplication.java index fce7b551c6142..7171c75b98604 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplication.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplication.java @@ -19,6 +19,7 @@ import org.apache.hadoop.classification.InterfaceAudience.Private; import org.apache.hadoop.classification.InterfaceStability.Unstable; +import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; import org.apache.hadoop.yarn.api.records.Priority; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState; @@ -67,8 +68,8 @@ public void setCurrentAppAttempt(T currentAttempt) { this.currentAttempt = currentAttempt; } - public void stop(RMAppState rmAppFinalState) { - queue.getMetrics().finishApp(user, rmAppFinalState, isUnmanagedAM()); + public void stop(RMAppState rmAppFinalState, FinalApplicationStatus finalApplicationStatus) { + queue.getMetrics().finishApp(user, rmAppFinalState, finalApplicationStatus, isUnmanagedAM()); } public Priority getPriority() { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/AbstractLeafQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/AbstractLeafQueue.java index 08fedb578cab9..fdd7dc72e4a8a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/AbstractLeafQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/AbstractLeafQueue.java @@ -803,7 +803,7 @@ public Resource calculateAndGetAMResourceLimitPerPartition( resourceCalculator, queuePartitionUsableResource, amResourcePercent, queueAllocationSettings.getMinimumAllocation()); - usageTracker.getMetrics().setAMResouceLimit(nodePartition, amResouceLimit); + usageTracker.getMetrics().setAMResourceLimit(nodePartition, amResouceLimit); usageTracker.getQueueUsage().setAMLimit(nodePartition, amResouceLimit); LOG.debug("Queue: {}, node label : {}, queue partition resource : {}," + " queue current limit : {}, queue partition usable resource : {}," @@ -920,7 +920,7 @@ protected void activateApplications() { user.getResourceUsage().setAMLimit(partitionName, userAMLimit); usageTracker.getMetrics().incAMUsed(partitionName, application.getUser(), application.getAMResource(partitionName)); - usageTracker.getMetrics().setAMResouceLimitForUser(partitionName, + usageTracker.getMetrics().setAMResourceLimitForUser(partitionName, application.getUser(), userAMLimit); fsApp.remove(); LOG.info("Application " + applicationId + " from user: " + application diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueueMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueueMetrics.java index 16ebc15512847..54be5c3eb2324 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueueMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueueMetrics.java @@ -120,18 +120,18 @@ public long getUsedAMResourceVCores() { return usedAMResourceVCores.value(); } - public void setAMResouceLimit(String partition, Resource res) { + public void setAMResourceLimit(String partition, Resource res) { if(partition == null || partition.equals(RMNodeLabelsManager.NO_LABEL)) { AMResourceLimitMB.set(res.getMemorySize()); AMResourceLimitVCores.set(res.getVirtualCores()); } } - public void setAMResouceLimitForUser(String partition, + public void setAMResourceLimitForUser(String partition, String user, Resource res) { CSQueueMetrics userMetrics = (CSQueueMetrics) getUserMetrics(user); if (userMetrics != null) { - userMetrics.setAMResouceLimit(partition, res); + userMetrics.setAMResourceLimit(partition, res); } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java index e513359af0d63..7caf94b17ed6c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java @@ -56,6 +56,7 @@ import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerStatus; +import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; import org.apache.hadoop.yarn.api.records.ExecutionType; import org.apache.hadoop.yarn.api.records.NodeAttribute; import org.apache.hadoop.yarn.api.records.NodeId; @@ -1198,7 +1199,7 @@ private void addApplicationAttempt( } private void doneApplication(ApplicationId applicationId, - RMAppState finalState) { + RMAppState finalState, FinalApplicationStatus finalApplicationStatus) { writeLock.lock(); try { SchedulerApplication application = applications.get( @@ -1216,7 +1217,7 @@ private void doneApplication(ApplicationId applicationId, } else{ queue.finishApplication(applicationId, application.getUser()); } - application.stop(finalState); + application.stop(finalState, finalApplicationStatus); applications.remove(applicationId); } finally { writeLock.unlock(); @@ -2013,7 +2014,7 @@ public void handle(SchedulerEvent event) { { AppRemovedSchedulerEvent appRemovedEvent = (AppRemovedSchedulerEvent)event; doneApplication(appRemovedEvent.getApplicationID(), - appRemovedEvent.getFinalState()); + appRemovedEvent.getFinalState(), appRemovedEvent.getFinalApplicationStatus()); } break; case APP_ATTEMPT_ADDED: diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/event/AppRemovedSchedulerEvent.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/event/AppRemovedSchedulerEvent.java index 9842bed00b272..1e9d3a99d619c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/event/AppRemovedSchedulerEvent.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/event/AppRemovedSchedulerEvent.java @@ -19,18 +19,21 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler.event; import org.apache.hadoop.yarn.api.records.ApplicationId; +import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState; public class AppRemovedSchedulerEvent extends SchedulerEvent { private final ApplicationId applicationId; private final RMAppState finalState; + private final FinalApplicationStatus finalApplicationStatus; public AppRemovedSchedulerEvent(ApplicationId applicationId, - RMAppState finalState) { + RMAppState finalState, FinalApplicationStatus finalApplicationStatus) { super(SchedulerEventType.APP_REMOVED); this.applicationId = applicationId; this.finalState = finalState; + this.finalApplicationStatus = finalApplicationStatus; } public ApplicationId getApplicationID() { @@ -40,4 +43,8 @@ public ApplicationId getApplicationID() { public RMAppState getFinalState() { return this.finalState; } + + public FinalApplicationStatus getFinalApplicationStatus() { + return finalApplicationStatus; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java index a865d7543dd3d..0e9b421257755 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java @@ -29,6 +29,7 @@ import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerStatus; +import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; import org.apache.hadoop.yarn.api.records.NMToken; import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.QueueACL; @@ -635,13 +636,13 @@ protected void addApplicationAttempt( } private void removeApplication(ApplicationId applicationId, - RMAppState finalState) { + RMAppState finalState, FinalApplicationStatus finalApplicationStatus) { SchedulerApplication application = applications.remove( applicationId); if (application == null) { LOG.warn("Couldn't find application " + applicationId); } else{ - application.stop(finalState); + application.stop(finalState, finalApplicationStatus); } } @@ -1262,7 +1263,7 @@ public void handle(SchedulerEvent event) { } AppRemovedSchedulerEvent appRemovedEvent = (AppRemovedSchedulerEvent)event; removeApplication(appRemovedEvent.getApplicationID(), - appRemovedEvent.getFinalState()); + appRemovedEvent.getFinalState(), appRemovedEvent.getFinalApplicationStatus()); break; case NODE_RESOURCE_UPDATE: if (!(event instanceof NodeResourceUpdateSchedulerEvent)) { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java index 42b1ec32c099c..e3d069b299bb7 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java @@ -32,6 +32,7 @@ import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerStatus; +import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.Priority; import org.apache.hadoop.yarn.api.records.QueueACL; @@ -441,7 +442,7 @@ public synchronized void addApplication(ApplicationId applicationId, } private synchronized void doneApplication(ApplicationId applicationId, - RMAppState finalState) { + RMAppState finalState, FinalApplicationStatus finalApplicationStatus) { SchedulerApplication application = applications.get(applicationId); if (application == null){ @@ -452,7 +453,7 @@ private synchronized void doneApplication(ApplicationId applicationId, // Inform the activeUsersManager activeUsersManager.deactivateApplication(application.getUser(), applicationId); - application.stop(finalState); + application.stop(finalState, finalApplicationStatus); applications.remove(applicationId); } @@ -779,7 +780,7 @@ public void handle(SchedulerEvent event) { { AppRemovedSchedulerEvent appRemovedEvent = (AppRemovedSchedulerEvent)event; doneApplication(appRemovedEvent.getApplicationID(), - appRemovedEvent.getFinalState()); + appRemovedEvent.getFinalState(), appRemovedEvent.getFinalApplicationStatus()); } break; case APP_ATTEMPT_ADDED: diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/MetricsOverviewTable.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/MetricsOverviewTable.java index c9922964ff980..f85e65a626267 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/MetricsOverviewTable.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/MetricsOverviewTable.java @@ -100,6 +100,7 @@ protected void render(Block html) { th().$class("ui-state-default").__("Apps Submitted").__(). th().$class("ui-state-default").__("Apps Pending").__(). th().$class("ui-state-default").__("Apps Running").__(). + th().$class("ui-state-default").__("Apps FinalFailed").__(). th().$class("ui-state-default").__("Apps Completed").__(). th().$class("ui-state-default").__("Containers Running").__(). th().$class("ui-state-default").__("Used Resources").__(). @@ -114,6 +115,7 @@ protected void render(Block html) { td(String.valueOf(clusterMetrics.getAppsSubmitted())). td(String.valueOf(clusterMetrics.getAppsPending())). td(String.valueOf(clusterMetrics.getAppsRunning())). + td(String.valueOf(clusterMetrics.getAppsFinalFailed())). td( String.valueOf( clusterMetrics.getAppsCompleted() + @@ -165,6 +167,7 @@ protected void render(Block html) { th().$class("ui-state-default").__("Apps Submitted").__(). th().$class("ui-state-default").__("Apps Pending").__(). th().$class("ui-state-default").__("Apps Running").__(). + th().$class("ui-state-default").__("Apps FinalFailed").__(). th().$class("ui-state-default").__("Apps Completed").__(). th().$class("ui-state-default").__("Containers Running").__(). th().$class("ui-state-default").__("Containers Pending").__(). @@ -182,6 +185,7 @@ protected void render(Block html) { td(String.valueOf(userMetrics.getAppsSubmitted())). td(String.valueOf(userMetrics.getAppsPending())). td(String.valueOf(userMetrics.getAppsRunning())). + td(String.valueOf(userMetrics.getAppsFinalFailed())). td( String.valueOf( (userMetrics.getAppsCompleted() + diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/ClusterMetricsInfo.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/ClusterMetricsInfo.java index e188fa0526894..ac695e4eb072d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/ClusterMetricsInfo.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/ClusterMetricsInfo.java @@ -38,6 +38,7 @@ public class ClusterMetricsInfo { private int appsRunning; private int appsFailed; private int appsKilled; + private int appsFinalFailed; private long reservedMB; private long availableMB; @@ -103,6 +104,7 @@ public ClusterMetricsInfo(final ResourceScheduler rs) { this.appsRunning = metrics.getAppsRunning(); this.appsFailed = metrics.getAppsFailed(); this.appsKilled = metrics.getAppsKilled(); + this.appsFinalFailed = metrics.getAppsFinalFailed(); this.reservedMB = metrics.getReservedMB(); this.availableMB = metrics.getAvailableMB(); @@ -193,6 +195,10 @@ public int getAppsKilled() { return appsKilled; } + public int getAppsFinalFailed() { + return appsFinalFailed; + } + public long getReservedMB() { return this.reservedMB; } @@ -321,6 +327,10 @@ public void setAppsKilled(int appsKilled) { this.appsKilled = appsKilled; } + public void setAppsFinalFailed(int appsFinalFailed) { + this.appsFinalFailed = appsFinalFailed; + } + public void setReservedMB(long reservedMB) { this.reservedMB = reservedMB; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/UserMetricsInfo.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/UserMetricsInfo.java index b39c283029e38..9c2b7fc52b48f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/UserMetricsInfo.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/dao/UserMetricsInfo.java @@ -36,6 +36,7 @@ public class UserMetricsInfo { protected int appsRunning; protected int appsFailed; protected int appsKilled; + protected int appsFinalFailed; protected int runningContainers; protected int pendingContainers; protected int reservedContainers; @@ -67,6 +68,7 @@ public UserMetricsInfo(final ResourceManager rm, final String user) { this.appsRunning = userMetrics.getAppsRunning(); this.appsFailed = userMetrics.getAppsFailed(); this.appsKilled = userMetrics.getAppsKilled(); + this.appsFinalFailed = userMetrics.getAppsFinalFailed(); this.runningContainers = userMetrics.getAllocatedContainers(); this.pendingContainers = userMetrics.getPendingContainers(); @@ -110,6 +112,10 @@ public int getAppsKilled() { return appsKilled; } + public int getAppsFinalFailed() { + return appsFinalFailed; + } + public long getReservedMB() { return this.reservedMB; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AppMetricsChecker.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AppMetricsChecker.java index 19ee08f17cf2c..983fbf593ee6e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AppMetricsChecker.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AppMetricsChecker.java @@ -30,6 +30,7 @@ import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.AppMetricsChecker.AppMetricsKey.APPS_COMPLETED; import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.AppMetricsChecker.AppMetricsKey.APPS_FAILED; import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.AppMetricsChecker.AppMetricsKey.APPS_KILLED; +import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.AppMetricsChecker.AppMetricsKey.APPS_FINAL_FAILED; import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.AppMetricsChecker.AppMetricsKey.APPS_PENDING; import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.AppMetricsChecker.AppMetricsKey.APPS_RUNNING; import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.AppMetricsChecker.AppMetricsKey.APPS_SUBMITTED; @@ -52,6 +53,7 @@ final class AppMetricsChecker { .counter(APPS_COMPLETED, 0) .counter(APPS_FAILED, 0) .counter(APPS_KILLED, 0) + .counter(APPS_FINAL_FAILED, 0) .counter(UNMANAGED_APPS_SUBMITTED, 0) .gaugeInt(UNMANAGED_APPS_PENDING, 0) .gaugeInt(UNMANAGED_APPS_RUNNING, 0) @@ -66,6 +68,7 @@ enum AppMetricsKey { APPS_COMPLETED("AppsCompleted"), APPS_FAILED("AppsFailed"), APPS_KILLED("AppsKilled"), + APPS_FINAL_FAILED("AppsFinalFailed"), UNMANAGED_APPS_SUBMITTED("UnmanagedAppsSubmitted"), UNMANAGED_APPS_PENDING("UnmanagedAppsPending"), UNMANAGED_APPS_RUNNING("UnmanagedAppsRunning"), diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestPartitionQueueMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestPartitionQueueMetrics.java index 8424f195e29eb..47ce3fadc6c29 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestPartitionQueueMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestPartitionQueueMetrics.java @@ -31,6 +31,7 @@ import org.apache.hadoop.metrics2.impl.MetricsSystemImpl; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationId; +import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSQueueMetrics; @@ -501,7 +502,7 @@ public void testTwoLevelWithUserMetrics() { metrics.finishAppAttempt(app.getApplicationId(), app.isPending(), app.getUser(), false); - metrics.finishApp(user, RMAppState.FINISHED, false); + metrics.finishApp(user, RMAppState.FINISHED, FinalApplicationStatus.SUCCEEDED,false); } @Test @@ -625,7 +626,7 @@ public void testThreeLevelWithUserMetrics() { metrics1.finishAppAttempt(app.getApplicationId(), app.isPending(), app.getUser(), false); - metrics1.finishApp(user, RMAppState.FINISHED, false); + metrics1.finishApp(user, RMAppState.FINISHED, FinalApplicationStatus.SUCCEEDED, false); } /** @@ -690,7 +691,7 @@ public void testSinglePartitionWithSingleLevelQueueMetricsWithoutUserMetrics() q1.finishAppAttempt(app.getApplicationId(), app.isPending(), app.getUser(), false); - q1.finishApp(user, RMAppState.FINISHED, false); + q1.finishApp(user, RMAppState.FINISHED, FinalApplicationStatus.FAILED, false); } public static MetricsSource partitionSource(MetricsSystem ms, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestQueueMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestQueueMetrics.java index 2137285bac036..8dee1a2b667f4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestQueueMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestQueueMetrics.java @@ -26,6 +26,7 @@ import org.apache.hadoop.test.MetricsAsserts; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationId; +import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.server.resourcemanager.MockRM; import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager; @@ -41,6 +42,7 @@ import static org.apache.hadoop.test.MetricsAsserts.getMetrics; import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.AppMetricsChecker.AppMetricsKey.APPS_COMPLETED; import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.AppMetricsChecker.AppMetricsKey.APPS_FAILED; +import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.AppMetricsChecker.AppMetricsKey.APPS_FINAL_FAILED; import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.AppMetricsChecker.AppMetricsKey.APPS_PENDING; import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.AppMetricsChecker.AppMetricsKey.APPS_RUNNING; import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.AppMetricsChecker.AppMetricsKey.APPS_SUBMITTED; @@ -160,7 +162,7 @@ public void testDefaultSingleQueueMetrics() { .counter(APPS_SUBMITTED, 1) .gaugeInt(APPS_RUNNING, 0) .checkAgainst(queueSource, true); - metrics.finishApp(USER, RMAppState.FINISHED, false); + metrics.finishApp(USER, RMAppState.FINISHED, FinalApplicationStatus.FAILED, false); AppMetricsChecker.createFromChecker(appMetricsChecker) .counter(APPS_COMPLETED, 1) .checkAgainst(queueSource, true); @@ -238,10 +240,11 @@ public void testQueueAppMetricsForMultipleFailures() { .gaugeInt(APPS_RUNNING, 0) .checkAgainst(queueSource, true); - metrics.finishApp(USER, RMAppState.FAILED, false); + metrics.finishApp(USER, RMAppState.FAILED, FinalApplicationStatus.FAILED,false); AppMetricsChecker.createFromChecker(appMetricsChecker) .gaugeInt(APPS_RUNNING, 0) .counter(APPS_FAILED, 1) + .counter(APPS_FINAL_FAILED, 1) .checkAgainst(queueSource, true); assertNull(userSource); @@ -319,10 +322,10 @@ public void testQueueUnmanagedAppMetricsForMultipleFailures() { .gaugeInt(UNMANAGED_APPS_RUNNING, 0).gaugeInt(APPS_RUNNING, 0) .checkAgainst(queueSource, true); - metrics.finishApp(USER, RMAppState.FAILED, true); + metrics.finishApp(USER, RMAppState.FAILED, FinalApplicationStatus.FAILED,true); AppMetricsChecker.createFromChecker(appMetricsChecker) .gaugeInt(UNMANAGED_APPS_RUNNING, 0).gaugeInt(APPS_RUNNING, 0) - .counter(UNMANAGED_APPS_FAILED, 1).counter(APPS_FAILED, 1) + .counter(UNMANAGED_APPS_FAILED, 1).counter(APPS_FAILED, 1).counter(APPS_FINAL_FAILED, 1) .checkAgainst(queueSource, true); assertNull(userSource); @@ -443,7 +446,7 @@ public void testSingleQueueWithUserMetrics() { AppMetricsChecker.createFromChecker(appMetricsUserSourceChecker) .gaugeInt(APPS_RUNNING, 0) .checkAgainst(userSource, true); - metrics.finishApp(USER_2, RMAppState.FINISHED, false); + metrics.finishApp(USER_2, RMAppState.FINISHED, FinalApplicationStatus.FAILED, false); AppMetricsChecker.createFromChecker(appMetricsQueueSourceChecker) .counter(APPS_COMPLETED, 1) .checkAgainst(queueSource, true); @@ -712,7 +715,7 @@ public void testTwoLevelWithUserMetrics() { .gaugeInt(APPS_RUNNING, 0) .checkAgainst(root.userSource, true); - leaf.queueMetrics.finishApp(USER, RMAppState.FINISHED, false); + leaf.queueMetrics.finishApp(USER, RMAppState.FINISHED, FinalApplicationStatus.FAILED,false); AppMetricsChecker.createFromChecker(appMetricsQueueSourceChecker) .counter(APPS_COMPLETED, 1) .checkAgainst(leaf.queueSource, true); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestSchedulerUtils.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestSchedulerUtils.java index bc41c359c2e1a..a2a0b20ac916c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestSchedulerUtils.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestSchedulerUtils.java @@ -58,6 +58,7 @@ import org.apache.hadoop.yarn.api.records.ContainerExitStatus; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerStatus; +import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; import org.apache.hadoop.yarn.api.records.NodeLabel; import org.apache.hadoop.yarn.api.records.Priority; import org.apache.hadoop.yarn.api.records.QueueInfo; @@ -1148,7 +1149,7 @@ public void testEnforcePartitionExclusivityMultipleLabels() { Assert.assertEquals("user", app.getUser()); AppRemovedSchedulerEvent appRemoveEvent = - new AppRemovedSchedulerEvent(appId, RMAppState.FINISHED); + new AppRemovedSchedulerEvent(appId, RMAppState.FINISHED, FinalApplicationStatus.SUCCEEDED); handler.handle(appRemoveEvent); Assert.assertNull(applications.get(appId)); return app; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestAutoCreatedQueueDeletionPolicy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestAutoCreatedQueueDeletionPolicy.java index 5359178d3aab0..541bc24c5e4d5 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestAutoCreatedQueueDeletionPolicy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestAutoCreatedQueueDeletionPolicy.java @@ -20,6 +20,7 @@ import org.apache.hadoop.test.GenericTestUtils; import org.apache.hadoop.util.Time; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; +import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptRemovedSchedulerEvent; @@ -81,7 +82,7 @@ public void testEditSchedule() throws Exception { RMAppAttemptState.FINISHED, false); cs.handle(event); AppRemovedSchedulerEvent rEvent = new AppRemovedSchedulerEvent( - user0AppAttemptId.getApplicationId(), RMAppState.FINISHED); + user0AppAttemptId.getApplicationId(), RMAppState.FINISHED, FinalApplicationStatus.FAILED); cs.handle(rEvent); // There are no apps in user0 diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacitySchedulerNewQueueAutoCreation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacitySchedulerNewQueueAutoCreation.java index 037312b716a05..4a2d8efd9e6ee 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacitySchedulerNewQueueAutoCreation.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacitySchedulerNewQueueAutoCreation.java @@ -18,6 +18,7 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity; +import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; import org.apache.hadoop.yarn.api.records.QueueState; import org.apache.hadoop.test.GenericTestUtils; import org.apache.hadoop.util.Time; @@ -903,7 +904,7 @@ public void testCapacitySchedulerAutoQueueDeletion() throws Exception { RMAppAttemptState.FINISHED, false); cs.handle(event); AppRemovedSchedulerEvent rEvent = new AppRemovedSchedulerEvent( - a2App.getApplicationId(), RMAppState.FINISHED); + a2App.getApplicationId(), RMAppState.FINISHED, FinalApplicationStatus.FAILED); cs.handle(rEvent); // Now there are no apps in a2 queue. @@ -985,7 +986,7 @@ public void testCapacitySchedulerAutoQueueDeletionDisabled() RMAppAttemptState.FINISHED, false); cs.handle(event); AppRemovedSchedulerEvent rEvent = new AppRemovedSchedulerEvent( - a2App.getApplicationId(), RMAppState.FINISHED); + a2App.getApplicationId(), RMAppState.FINISHED, FinalApplicationStatus.FAILED); cs.handle(rEvent); // Now there are no apps in a2 queue. diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java index eca065b148766..ce2f3b1632b30 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java @@ -68,6 +68,7 @@ import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; import org.apache.hadoop.yarn.api.records.ContainerState; import org.apache.hadoop.yarn.api.records.ContainerStatus; +import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.Priority; import org.apache.hadoop.yarn.api.records.QueueACL; @@ -541,7 +542,7 @@ public void testAppAttemptMetrics() throws Exception { RMAppAttemptState.FINISHED, false); cs.handle(event); AppRemovedSchedulerEvent rEvent = new AppRemovedSchedulerEvent( - appAttemptId_0.getApplicationId(), RMAppState.FINISHED); + appAttemptId_0.getApplicationId(), RMAppState.FINISHED, FinalApplicationStatus.FAILED); cs.handle(rEvent); assertEquals(1, a.getMetrics().getAppsSubmitted()); @@ -609,7 +610,7 @@ public void testUnmanagedAppAttemptMetrics() throws Exception { RMAppAttemptState.FINISHED, false); cs.handle(event); AppRemovedSchedulerEvent rEvent = new AppRemovedSchedulerEvent( - appAttemptId0.getApplicationId(), RMAppState.FINISHED); + appAttemptId0.getApplicationId(), RMAppState.FINISHED, FinalApplicationStatus.FAILED); cs.handle(rEvent); assertEquals(1, a.getMetrics().getUnmanagedAppsSubmitted()); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/RouterWebServiceUtil.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/RouterWebServiceUtil.java index e33ce155079bf..9630948f938f1 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/RouterWebServiceUtil.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/webapp/RouterWebServiceUtil.java @@ -463,6 +463,8 @@ public static void mergeMetrics(ClusterMetricsInfo metrics, metrics.getAppsFailed() + metricsResponse.getAppsFailed()); metrics.setAppsKilled( metrics.getAppsKilled() + metricsResponse.getAppsKilled()); + metrics.setAppsFinalFailed( + metrics.getAppsFinalFailed() + metricsResponse.getAppsFinalFailed()); metrics.setReservedMB( metrics.getReservedMB() + metricsResponse.getReservedMB()); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/MockDefaultRequestInterceptorREST.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/MockDefaultRequestInterceptorREST.java index e2ac5fbf260cd..bbf8201d50d84 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/MockDefaultRequestInterceptorREST.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/MockDefaultRequestInterceptorREST.java @@ -337,6 +337,7 @@ public ClusterMetricsInfo getClusterMetricsInfo() { metrics.setAppsRunning(Integer.valueOf(getSubClusterId().getId())); metrics.setAppsFailed(Integer.valueOf(getSubClusterId().getId())); metrics.setAppsKilled(Integer.valueOf(getSubClusterId().getId())); + metrics.setAppsFinalFailed(Integer.valueOf(getSubClusterId().getId())); return metrics; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/TestFederationInterceptorRESTRetry.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/TestFederationInterceptorRESTRetry.java index 790cf410bed75..fb917f2dcd226 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/TestFederationInterceptorRESTRetry.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/TestFederationInterceptorRESTRetry.java @@ -491,6 +491,8 @@ private void checkMetricsFromGoodSC(ClusterMetricsInfo response) { response.getAppsFailed()); Assert.assertEquals(Integer.parseInt(good.getId()), response.getAppsKilled()); + Assert.assertEquals(Integer.parseInt(good.getId()), + response.getAppsFinalFailed()); } private void checkEmptyMetrics(ClusterMetricsInfo response) { @@ -500,6 +502,7 @@ private void checkEmptyMetrics(ClusterMetricsInfo response) { Assert.assertEquals(0, response.getAppsRunning()); Assert.assertEquals(0, response.getAppsFailed()); Assert.assertEquals(0, response.getAppsKilled()); + Assert.assertEquals(0, response.getAppsFinalFailed()); Assert.assertEquals(0, response.getReservedMB()); Assert.assertEquals(0, response.getAvailableMB()); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/TestRouterWebServiceUtil.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/TestRouterWebServiceUtil.java index 96a6881adc6d0..3d2f17d8f1520 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/TestRouterWebServiceUtil.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/webapp/TestRouterWebServiceUtil.java @@ -444,6 +444,9 @@ public void testMergeMetrics() { Assert.assertEquals( metricsResponse.getAppsKilled() + metricsClone.getAppsKilled(), metrics.getAppsKilled()); + Assert.assertEquals( + metricsResponse.getAppsFinalFailed() + metricsClone.getAppsFinalFailed(), + metrics.getAppsFinalFailed()); Assert.assertEquals( metricsResponse.getReservedMB() + metricsClone.getReservedMB(), @@ -525,6 +528,7 @@ private ClusterMetricsInfo createClusterMetricsClone( metricsClone.setAppsRunning(metrics.getAppsRunning()); metricsClone.setAppsFailed(metrics.getAppsFailed()); metricsClone.setAppsKilled(metrics.getAppsKilled()); + metricsClone.setAppsFinalFailed(metrics.getAppsFinalFailed()); metricsClone.setReservedMB(metrics.getReservedMB()); metricsClone.setAvailableMB(metrics.getAvailableMB()); @@ -561,6 +565,7 @@ private void setUpClusterMetrics(ClusterMetricsInfo metrics, long seed) { metrics.setAppsRunning(rand.nextInt(1000)); metrics.setAppsFailed(rand.nextInt(1000)); metrics.setAppsKilled(rand.nextInt(1000)); + metrics.setAppsFinalFailed(rand.nextInt(1000)); metrics.setReservedMB(rand.nextInt(1000)); metrics.setAvailableMB(rand.nextInt(1000)); From ff0220ed3cc94e721d5f43a3b33167fb9ce506d2 Mon Sep 17 00:00:00 2001 From: garyhwang Date: Tue, 17 Jan 2023 17:35:46 +0800 Subject: [PATCH 2/4] add metrics description --- .../hadoop-yarn-site/src/site/markdown/ResourceManagerRest.md | 1 + 1 file changed, 1 insertion(+) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/ResourceManagerRest.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/ResourceManagerRest.md index 001fb6b5f24a0..f07f5c56d9dab 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/ResourceManagerRest.md +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/ResourceManagerRest.md @@ -162,6 +162,7 @@ The cluster metrics resource provides some overall metrics about the cluster. Mo | appsPending | int | The number of applications pending | | appsRunning | int | The number of applications running | | appsFailed | int | The number of applications failed | +| appsFinalFailed | int | The number of applications finally failed | | appsKilled | int | The number of applications killed | | reservedMB | long | The amount of memory reserved in MB | | availableMB | long | The amount of memory available in MB | From 5648a148274c9f14fa6e99985801a63e336df39d Mon Sep 17 00:00:00 2001 From: garyhwang Date: Tue, 17 Jan 2023 22:49:13 +0800 Subject: [PATCH 3/4] optimize code style --- .../scheduler/capacity/CapacityScheduler.java | 2 +- .../resourcemanager/scheduler/fair/FairScheduler.java | 2 +- .../resourcemanager/scheduler/fifo/FifoScheduler.java | 2 +- .../scheduler/TestPartitionQueueMetrics.java | 2 +- .../server/resourcemanager/scheduler/TestQueueMetrics.java | 6 +++--- .../resourcemanager/scheduler/TestSchedulerUtils.java | 3 ++- 6 files changed, 9 insertions(+), 8 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java index 7caf94b17ed6c..009b471ed30a9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java @@ -2014,7 +2014,7 @@ public void handle(SchedulerEvent event) { { AppRemovedSchedulerEvent appRemovedEvent = (AppRemovedSchedulerEvent)event; doneApplication(appRemovedEvent.getApplicationID(), - appRemovedEvent.getFinalState(), appRemovedEvent.getFinalApplicationStatus()); + appRemovedEvent.getFinalState(), appRemovedEvent.getFinalApplicationStatus()); } break; case APP_ATTEMPT_ADDED: diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java index 0e9b421257755..ddf5b51a0636e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java @@ -1263,7 +1263,7 @@ public void handle(SchedulerEvent event) { } AppRemovedSchedulerEvent appRemovedEvent = (AppRemovedSchedulerEvent)event; removeApplication(appRemovedEvent.getApplicationID(), - appRemovedEvent.getFinalState(), appRemovedEvent.getFinalApplicationStatus()); + appRemovedEvent.getFinalState(), appRemovedEvent.getFinalApplicationStatus()); break; case NODE_RESOURCE_UPDATE: if (!(event instanceof NodeResourceUpdateSchedulerEvent)) { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java index e3d069b299bb7..10761eed6369a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java @@ -780,7 +780,7 @@ public void handle(SchedulerEvent event) { { AppRemovedSchedulerEvent appRemovedEvent = (AppRemovedSchedulerEvent)event; doneApplication(appRemovedEvent.getApplicationID(), - appRemovedEvent.getFinalState(), appRemovedEvent.getFinalApplicationStatus()); + appRemovedEvent.getFinalState(), appRemovedEvent.getFinalApplicationStatus()); } break; case APP_ATTEMPT_ADDED: diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestPartitionQueueMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestPartitionQueueMetrics.java index 47ce3fadc6c29..ccc240abac4bd 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestPartitionQueueMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestPartitionQueueMetrics.java @@ -502,7 +502,7 @@ public void testTwoLevelWithUserMetrics() { metrics.finishAppAttempt(app.getApplicationId(), app.isPending(), app.getUser(), false); - metrics.finishApp(user, RMAppState.FINISHED, FinalApplicationStatus.SUCCEEDED,false); + metrics.finishApp(user, RMAppState.FINISHED, FinalApplicationStatus.SUCCEEDED, false); } @Test diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestQueueMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestQueueMetrics.java index 8dee1a2b667f4..b8f75ead1dcff 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestQueueMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestQueueMetrics.java @@ -240,7 +240,7 @@ public void testQueueAppMetricsForMultipleFailures() { .gaugeInt(APPS_RUNNING, 0) .checkAgainst(queueSource, true); - metrics.finishApp(USER, RMAppState.FAILED, FinalApplicationStatus.FAILED,false); + metrics.finishApp(USER, RMAppState.FAILED, FinalApplicationStatus.FAILED, false); AppMetricsChecker.createFromChecker(appMetricsChecker) .gaugeInt(APPS_RUNNING, 0) .counter(APPS_FAILED, 1) @@ -322,7 +322,7 @@ public void testQueueUnmanagedAppMetricsForMultipleFailures() { .gaugeInt(UNMANAGED_APPS_RUNNING, 0).gaugeInt(APPS_RUNNING, 0) .checkAgainst(queueSource, true); - metrics.finishApp(USER, RMAppState.FAILED, FinalApplicationStatus.FAILED,true); + metrics.finishApp(USER, RMAppState.FAILED, FinalApplicationStatus.FAILED, true); AppMetricsChecker.createFromChecker(appMetricsChecker) .gaugeInt(UNMANAGED_APPS_RUNNING, 0).gaugeInt(APPS_RUNNING, 0) .counter(UNMANAGED_APPS_FAILED, 1).counter(APPS_FAILED, 1).counter(APPS_FINAL_FAILED, 1) @@ -715,7 +715,7 @@ public void testTwoLevelWithUserMetrics() { .gaugeInt(APPS_RUNNING, 0) .checkAgainst(root.userSource, true); - leaf.queueMetrics.finishApp(USER, RMAppState.FINISHED, FinalApplicationStatus.FAILED,false); + leaf.queueMetrics.finishApp(USER, RMAppState.FINISHED, FinalApplicationStatus.FAILED, false); AppMetricsChecker.createFromChecker(appMetricsQueueSourceChecker) .counter(APPS_COMPLETED, 1) .checkAgainst(leaf.queueSource, true); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestSchedulerUtils.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestSchedulerUtils.java index a2a0b20ac916c..581a083043175 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestSchedulerUtils.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestSchedulerUtils.java @@ -1149,7 +1149,8 @@ public void testEnforcePartitionExclusivityMultipleLabels() { Assert.assertEquals("user", app.getUser()); AppRemovedSchedulerEvent appRemoveEvent = - new AppRemovedSchedulerEvent(appId, RMAppState.FINISHED, FinalApplicationStatus.SUCCEEDED); + new AppRemovedSchedulerEvent(appId, RMAppState.FINISHED, + FinalApplicationStatus.SUCCEEDED); handler.handle(appRemoveEvent); Assert.assertNull(applications.get(appId)); return app; From 2fc1d60e8bf365513121018a63a2b5778971c012 Mon Sep 17 00:00:00 2001 From: garyhwang Date: Tue, 17 Jan 2023 23:24:10 +0800 Subject: [PATCH 4/4] fix unit test --- .../server/resourcemanager/scheduler/TestQueueMetrics.java | 6 +++--- .../server/resourcemanager/webapp/TestRMWebServices.java | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestQueueMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestQueueMetrics.java index b8f75ead1dcff..9614cdb08d1f0 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestQueueMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestQueueMetrics.java @@ -162,7 +162,7 @@ public void testDefaultSingleQueueMetrics() { .counter(APPS_SUBMITTED, 1) .gaugeInt(APPS_RUNNING, 0) .checkAgainst(queueSource, true); - metrics.finishApp(USER, RMAppState.FINISHED, FinalApplicationStatus.FAILED, false); + metrics.finishApp(USER, RMAppState.FINISHED, FinalApplicationStatus.SUCCEEDED, false); AppMetricsChecker.createFromChecker(appMetricsChecker) .counter(APPS_COMPLETED, 1) .checkAgainst(queueSource, true); @@ -446,7 +446,7 @@ public void testSingleQueueWithUserMetrics() { AppMetricsChecker.createFromChecker(appMetricsUserSourceChecker) .gaugeInt(APPS_RUNNING, 0) .checkAgainst(userSource, true); - metrics.finishApp(USER_2, RMAppState.FINISHED, FinalApplicationStatus.FAILED, false); + metrics.finishApp(USER_2, RMAppState.FINISHED, FinalApplicationStatus.SUCCEEDED, false); AppMetricsChecker.createFromChecker(appMetricsQueueSourceChecker) .counter(APPS_COMPLETED, 1) .checkAgainst(queueSource, true); @@ -715,7 +715,7 @@ public void testTwoLevelWithUserMetrics() { .gaugeInt(APPS_RUNNING, 0) .checkAgainst(root.userSource, true); - leaf.queueMetrics.finishApp(USER, RMAppState.FINISHED, FinalApplicationStatus.FAILED, false); + leaf.queueMetrics.finishApp(USER, RMAppState.FINISHED, FinalApplicationStatus.SUCCEEDED, false); AppMetricsChecker.createFromChecker(appMetricsQueueSourceChecker) .counter(APPS_COMPLETED, 1) .checkAgainst(leaf.queueSource, true); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServices.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServices.java index b416947c55012..a04d6c51a0c4c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServices.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/webapp/TestRMWebServices.java @@ -476,7 +476,7 @@ public void verifyClusterMetricsJSON(JSONObject json) throws JSONException, Exception { assertEquals("incorrect number of elements", 1, json.length()); JSONObject clusterinfo = json.getJSONObject("clusterMetrics"); - assertEquals("incorrect number of elements", 35, clusterinfo.length()); + assertEquals("incorrect number of elements", 36, clusterinfo.length()); verifyClusterMetrics( clusterinfo.getInt("appsSubmitted"), clusterinfo.getInt("appsCompleted"), clusterinfo.getInt("reservedMB"), clusterinfo.getInt("availableMB"),