Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1522,7 +1522,7 @@ private void completeAndCleanupApp(RMAppImpl app) {
// need to remove them from scheduler.
if (app.recoveredFinalState == null) {
app.handler.handle(new AppRemovedSchedulerEvent(app.applicationId,
finalState));
finalState, app.getFinalApplicationStatus()));
}

app.handler.handle(new RMAppManagerEvent(app.applicationId,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
import org.apache.hadoop.metrics2.lib.MutableRate;
import org.apache.hadoop.util.Sets;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.metrics.CustomResourceMetricValue;
Expand All @@ -64,6 +65,7 @@ public class QueueMetrics implements MetricsSource {
@Metric("# of apps completed") MutableCounterInt appsCompleted;
@Metric("# of apps killed") MutableCounterInt appsKilled;
@Metric("# of apps failed") MutableCounterInt appsFailed;
@Metric("# of apps finally failed") MutableCounterInt appsFinalFailed;

@Metric("# of Unmanaged apps submitted")
private MutableCounterInt unmanagedAppsSubmitted;
Expand Down Expand Up @@ -498,14 +500,18 @@ public void finishAppAttempt(ApplicationId appId, boolean isPending,
}
}

public void finishApp(String user, RMAppState rmAppFinalState,
public void finishApp(String user, RMAppState rmAppFinalState, FinalApplicationStatus finalApplicationStatus,
boolean unmanagedAM) {
switch (rmAppFinalState) {
case KILLED: appsKilled.incr(); break;
case FAILED: appsFailed.incr(); break;
default: appsCompleted.incr(); break;
}

if (finalApplicationStatus == FinalApplicationStatus.FAILED) {
appsFinalFailed.incr();
}

if(unmanagedAM) {
switch (rmAppFinalState) {
case KILLED:
Expand All @@ -522,10 +528,10 @@ public void finishApp(String user, RMAppState rmAppFinalState,

QueueMetrics userMetrics = getUserMetrics(user);
if (userMetrics != null) {
userMetrics.finishApp(user, rmAppFinalState, unmanagedAM);
userMetrics.finishApp(user, rmAppFinalState, finalApplicationStatus, unmanagedAM);
}
if (parent != null) {
parent.finishApp(user, rmAppFinalState, unmanagedAM);
parent.finishApp(user, rmAppFinalState, finalApplicationStatus, unmanagedAM);
}
}

Expand Down Expand Up @@ -1138,6 +1144,10 @@ public int getAppsFailed() {
return appsFailed.value();
}

public int getAppsFinalFailed() {
return appsFinalFailed.value();
}

public int getUnmanagedAppsFailed() {
return unmanagedAppsFailed.value();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.classification.InterfaceStability.Unstable;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState;

Expand Down Expand Up @@ -67,8 +68,8 @@ public void setCurrentAppAttempt(T currentAttempt) {
this.currentAttempt = currentAttempt;
}

public void stop(RMAppState rmAppFinalState) {
queue.getMetrics().finishApp(user, rmAppFinalState, isUnmanagedAM());
public void stop(RMAppState rmAppFinalState, FinalApplicationStatus finalApplicationStatus) {
queue.getMetrics().finishApp(user, rmAppFinalState, finalApplicationStatus, isUnmanagedAM());
}

public Priority getPriority() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -803,7 +803,7 @@ public Resource calculateAndGetAMResourceLimitPerPartition(
resourceCalculator, queuePartitionUsableResource, amResourcePercent,
queueAllocationSettings.getMinimumAllocation());

usageTracker.getMetrics().setAMResouceLimit(nodePartition, amResouceLimit);
usageTracker.getMetrics().setAMResourceLimit(nodePartition, amResouceLimit);
usageTracker.getQueueUsage().setAMLimit(nodePartition, amResouceLimit);
LOG.debug("Queue: {}, node label : {}, queue partition resource : {},"
+ " queue current limit : {}, queue partition usable resource : {},"
Expand Down Expand Up @@ -920,7 +920,7 @@ protected void activateApplications() {
user.getResourceUsage().setAMLimit(partitionName, userAMLimit);
usageTracker.getMetrics().incAMUsed(partitionName, application.getUser(),
application.getAMResource(partitionName));
usageTracker.getMetrics().setAMResouceLimitForUser(partitionName,
usageTracker.getMetrics().setAMResourceLimitForUser(partitionName,
application.getUser(), userAMLimit);
fsApp.remove();
LOG.info("Application " + applicationId + " from user: " + application
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,18 +120,18 @@ public long getUsedAMResourceVCores() {
return usedAMResourceVCores.value();
}

public void setAMResouceLimit(String partition, Resource res) {
public void setAMResourceLimit(String partition, Resource res) {
if(partition == null || partition.equals(RMNodeLabelsManager.NO_LABEL)) {
AMResourceLimitMB.set(res.getMemorySize());
AMResourceLimitVCores.set(res.getVirtualCores());
}
}

public void setAMResouceLimitForUser(String partition,
public void setAMResourceLimitForUser(String partition,
String user, Resource res) {
CSQueueMetrics userMetrics = (CSQueueMetrics) getUserMetrics(user);
if (userMetrics != null) {
userMetrics.setAMResouceLimit(partition, res);
userMetrics.setAMResourceLimit(partition, res);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.ExecutionType;
import org.apache.hadoop.yarn.api.records.NodeAttribute;
import org.apache.hadoop.yarn.api.records.NodeId;
Expand Down Expand Up @@ -1198,7 +1199,7 @@ private void addApplicationAttempt(
}

private void doneApplication(ApplicationId applicationId,
RMAppState finalState) {
RMAppState finalState, FinalApplicationStatus finalApplicationStatus) {
writeLock.lock();
try {
SchedulerApplication<FiCaSchedulerApp> application = applications.get(
Expand All @@ -1216,7 +1217,7 @@ private void doneApplication(ApplicationId applicationId,
} else{
queue.finishApplication(applicationId, application.getUser());
}
application.stop(finalState);
application.stop(finalState, finalApplicationStatus);
applications.remove(applicationId);
} finally {
writeLock.unlock();
Expand Down Expand Up @@ -2013,7 +2014,7 @@ public void handle(SchedulerEvent event) {
{
AppRemovedSchedulerEvent appRemovedEvent = (AppRemovedSchedulerEvent)event;
doneApplication(appRemovedEvent.getApplicationID(),
appRemovedEvent.getFinalState());
appRemovedEvent.getFinalState(), appRemovedEvent.getFinalApplicationStatus());
}
break;
case APP_ATTEMPT_ADDED:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,21 @@
package org.apache.hadoop.yarn.server.resourcemanager.scheduler.event;

import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState;

public class AppRemovedSchedulerEvent extends SchedulerEvent {

private final ApplicationId applicationId;
private final RMAppState finalState;
private final FinalApplicationStatus finalApplicationStatus;

public AppRemovedSchedulerEvent(ApplicationId applicationId,
RMAppState finalState) {
RMAppState finalState, FinalApplicationStatus finalApplicationStatus) {
super(SchedulerEventType.APP_REMOVED);
this.applicationId = applicationId;
this.finalState = finalState;
this.finalApplicationStatus = finalApplicationStatus;
}

public ApplicationId getApplicationID() {
Expand All @@ -40,4 +43,8 @@ public ApplicationId getApplicationID() {
public RMAppState getFinalState() {
return this.finalState;
}

public FinalApplicationStatus getFinalApplicationStatus() {
return finalApplicationStatus;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.NMToken;
import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.QueueACL;
Expand Down Expand Up @@ -635,13 +636,13 @@ protected void addApplicationAttempt(
}

private void removeApplication(ApplicationId applicationId,
RMAppState finalState) {
RMAppState finalState, FinalApplicationStatus finalApplicationStatus) {
SchedulerApplication<FSAppAttempt> application = applications.remove(
applicationId);
if (application == null) {
LOG.warn("Couldn't find application " + applicationId);
} else{
application.stop(finalState);
application.stop(finalState, finalApplicationStatus);
}
}

Expand Down Expand Up @@ -1262,7 +1263,7 @@ public void handle(SchedulerEvent event) {
}
AppRemovedSchedulerEvent appRemovedEvent = (AppRemovedSchedulerEvent)event;
removeApplication(appRemovedEvent.getApplicationID(),
appRemovedEvent.getFinalState());
appRemovedEvent.getFinalState(), appRemovedEvent.getFinalApplicationStatus());
break;
case NODE_RESOURCE_UPDATE:
if (!(event instanceof NodeResourceUpdateSchedulerEvent)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.QueueACL;
Expand Down Expand Up @@ -441,7 +442,7 @@ public synchronized void addApplication(ApplicationId applicationId,
}

private synchronized void doneApplication(ApplicationId applicationId,
RMAppState finalState) {
RMAppState finalState, FinalApplicationStatus finalApplicationStatus) {
SchedulerApplication<FifoAppAttempt> application =
applications.get(applicationId);
if (application == null){
Expand All @@ -452,7 +453,7 @@ private synchronized void doneApplication(ApplicationId applicationId,
// Inform the activeUsersManager
activeUsersManager.deactivateApplication(application.getUser(),
applicationId);
application.stop(finalState);
application.stop(finalState, finalApplicationStatus);
applications.remove(applicationId);
}

Expand Down Expand Up @@ -779,7 +780,7 @@ public void handle(SchedulerEvent event) {
{
AppRemovedSchedulerEvent appRemovedEvent = (AppRemovedSchedulerEvent)event;
doneApplication(appRemovedEvent.getApplicationID(),
appRemovedEvent.getFinalState());
appRemovedEvent.getFinalState(), appRemovedEvent.getFinalApplicationStatus());
}
break;
case APP_ATTEMPT_ADDED:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ protected void render(Block html) {
th().$class("ui-state-default").__("Apps Submitted").__().
th().$class("ui-state-default").__("Apps Pending").__().
th().$class("ui-state-default").__("Apps Running").__().
th().$class("ui-state-default").__("Apps FinalFailed").__().
th().$class("ui-state-default").__("Apps Completed").__().
th().$class("ui-state-default").__("Containers Running").__().
th().$class("ui-state-default").__("Used Resources").__().
Expand All @@ -114,6 +115,7 @@ protected void render(Block html) {
td(String.valueOf(clusterMetrics.getAppsSubmitted())).
td(String.valueOf(clusterMetrics.getAppsPending())).
td(String.valueOf(clusterMetrics.getAppsRunning())).
td(String.valueOf(clusterMetrics.getAppsFinalFailed())).
td(
String.valueOf(
clusterMetrics.getAppsCompleted() +
Expand Down Expand Up @@ -165,6 +167,7 @@ protected void render(Block html) {
th().$class("ui-state-default").__("Apps Submitted").__().
th().$class("ui-state-default").__("Apps Pending").__().
th().$class("ui-state-default").__("Apps Running").__().
th().$class("ui-state-default").__("Apps FinalFailed").__().
th().$class("ui-state-default").__("Apps Completed").__().
th().$class("ui-state-default").__("Containers Running").__().
th().$class("ui-state-default").__("Containers Pending").__().
Expand All @@ -182,6 +185,7 @@ protected void render(Block html) {
td(String.valueOf(userMetrics.getAppsSubmitted())).
td(String.valueOf(userMetrics.getAppsPending())).
td(String.valueOf(userMetrics.getAppsRunning())).
td(String.valueOf(userMetrics.getAppsFinalFailed())).
td(
String.valueOf(
(userMetrics.getAppsCompleted() +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ public class ClusterMetricsInfo {
private int appsRunning;
private int appsFailed;
private int appsKilled;
private int appsFinalFailed;

private long reservedMB;
private long availableMB;
Expand Down Expand Up @@ -103,6 +104,7 @@ public ClusterMetricsInfo(final ResourceScheduler rs) {
this.appsRunning = metrics.getAppsRunning();
this.appsFailed = metrics.getAppsFailed();
this.appsKilled = metrics.getAppsKilled();
this.appsFinalFailed = metrics.getAppsFinalFailed();

this.reservedMB = metrics.getReservedMB();
this.availableMB = metrics.getAvailableMB();
Expand Down Expand Up @@ -193,6 +195,10 @@ public int getAppsKilled() {
return appsKilled;
}

public int getAppsFinalFailed() {
return appsFinalFailed;
}

public long getReservedMB() {
return this.reservedMB;
}
Expand Down Expand Up @@ -321,6 +327,10 @@ public void setAppsKilled(int appsKilled) {
this.appsKilled = appsKilled;
}

public void setAppsFinalFailed(int appsFinalFailed) {
this.appsFinalFailed = appsFinalFailed;
}

public void setReservedMB(long reservedMB) {
this.reservedMB = reservedMB;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ public class UserMetricsInfo {
protected int appsRunning;
protected int appsFailed;
protected int appsKilled;
protected int appsFinalFailed;
protected int runningContainers;
protected int pendingContainers;
protected int reservedContainers;
Expand Down Expand Up @@ -67,6 +68,7 @@ public UserMetricsInfo(final ResourceManager rm, final String user) {
this.appsRunning = userMetrics.getAppsRunning();
this.appsFailed = userMetrics.getAppsFailed();
this.appsKilled = userMetrics.getAppsKilled();
this.appsFinalFailed = userMetrics.getAppsFinalFailed();

this.runningContainers = userMetrics.getAllocatedContainers();
this.pendingContainers = userMetrics.getPendingContainers();
Expand Down Expand Up @@ -110,6 +112,10 @@ public int getAppsKilled() {
return appsKilled;
}

public int getAppsFinalFailed() {
return appsFinalFailed;
}

public long getReservedMB() {
return this.reservedMB;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.AppMetricsChecker.AppMetricsKey.APPS_COMPLETED;
import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.AppMetricsChecker.AppMetricsKey.APPS_FAILED;
import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.AppMetricsChecker.AppMetricsKey.APPS_KILLED;
import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.AppMetricsChecker.AppMetricsKey.APPS_FINAL_FAILED;
import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.AppMetricsChecker.AppMetricsKey.APPS_PENDING;
import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.AppMetricsChecker.AppMetricsKey.APPS_RUNNING;
import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.AppMetricsChecker.AppMetricsKey.APPS_SUBMITTED;
Expand All @@ -52,6 +53,7 @@ final class AppMetricsChecker {
.counter(APPS_COMPLETED, 0)
.counter(APPS_FAILED, 0)
.counter(APPS_KILLED, 0)
.counter(APPS_FINAL_FAILED, 0)
.counter(UNMANAGED_APPS_SUBMITTED, 0)
.gaugeInt(UNMANAGED_APPS_PENDING, 0)
.gaugeInt(UNMANAGED_APPS_RUNNING, 0)
Expand All @@ -66,6 +68,7 @@ enum AppMetricsKey {
APPS_COMPLETED("AppsCompleted"),
APPS_FAILED("AppsFailed"),
APPS_KILLED("AppsKilled"),
APPS_FINAL_FAILED("AppsFinalFailed"),
UNMANAGED_APPS_SUBMITTED("UnmanagedAppsSubmitted"),
UNMANAGED_APPS_PENDING("UnmanagedAppsPending"),
UNMANAGED_APPS_RUNNING("UnmanagedAppsRunning"),
Expand Down
Loading