Skip to content

Commit

Permalink
YARN-3273. Improve scheduler UI to facilitate scheduling analysis and…
Browse files Browse the repository at this point in the history
… debugging. Contributed by Rohith Sharmaks
  • Loading branch information
jian-he committed Mar 18, 2015
1 parent 3bc72cc commit 658097d
Show file tree
Hide file tree
Showing 21 changed files with 371 additions and 62 deletions.
3 changes: 3 additions & 0 deletions hadoop-yarn-project/CHANGES.txt
Expand Up @@ -59,6 +59,9 @@ Release 2.8.0 - UNRELEASED
YARN-3243. CapacityScheduler should pass headroom from parent to children
to make sure ParentQueue obey its capacity limits. (Wangda Tan via jianhe)

YARN-3273. Improve scheduler UI to facilitate scheduling analysis and
debugging. (Rohith Sharmaks via jianhe)

OPTIMIZATIONS

YARN-3339. TestDockerContainerExecutor should pull a single image and not
Expand Down
Expand Up @@ -172,7 +172,7 @@ public Collection<ContainerReport> run() throws Exception {
._("Diagnostics Info:", appAttempt.getDiagnosticsInfo() == null ?
"" : appAttempt.getDiagnosticsInfo());

html._(InfoBlock.class);


if (exceptionWhenGetContainerReports) {
html
Expand All @@ -183,6 +183,19 @@ public Collection<ContainerReport> run() throws Exception {
return;
}

// TODO need to render applicationHeadRoom value from
// ApplicationAttemptMetrics after YARN-3284
if (webUiType.equals(YarnWebParams.RM_WEB_UI)) {
if (!isApplicationInFinalState(appAttempt.getAppAttemptState())) {
DIV<Hamlet> pdiv = html._(InfoBlock.class).div(_INFO_WRAP);
info("Application Attempt Overview").clear();
info("Application Attempt Metrics")._(
"Application Attempt Headroom : ", 0);
pdiv._();
}
}
html._(InfoBlock.class);

// Container Table
TBODY<TABLE<Hamlet>> tbody =
html.table("#containers").thead().tr().th(".id", "Container ID")
Expand Down Expand Up @@ -273,4 +286,10 @@ private boolean hasAMContainer(ContainerId containerId,
}
return false;
}

/**
 * Reports whether the given attempt state is one of FINISHED, FAILED or
 * KILLED.
 *
 * @param state the application attempt state to classify
 * @return {@code true} if {@code state} is FINISHED, FAILED or KILLED;
 *         {@code false} for every other value
 */
private boolean isApplicationInFinalState(YarnApplicationAttemptState state) {
  if (state == YarnApplicationAttemptState.FINISHED) {
    return true;
  }
  if (state == YarnApplicationAttemptState.FAILED) {
    return true;
  }
  return state == YarnApplicationAttemptState.KILLED;
}
}
Expand Up @@ -41,6 +41,8 @@ public class RMAppAttemptMetrics {
private ApplicationAttemptId attemptId = null;
// preemption info
private Resource resourcePreempted = Resource.newInstance(0, 0);
// application headroom
private volatile Resource applicationHeadroom = Resource.newInstance(0, 0);
private AtomicInteger numNonAMContainersPreempted = new AtomicInteger(0);
private AtomicBoolean isPreempted = new AtomicBoolean(false);

Expand Down Expand Up @@ -145,4 +147,12 @@ public int[][] getLocalityStatistics() {
/**
 * @return the current value of the {@code totalAllocatedContainers} counter
 */
public int getTotalAllocatedContainers() {
  return totalAllocatedContainers;
}

/**
 * @return the headroom most recently stored for this application attempt.
 *         The stored object itself is returned, not a copy.
 */
public Resource getApplicationAttemptHeadroom() {
  return this.applicationHeadroom;
}

/**
 * Replaces the headroom recorded for this application attempt.
 *
 * @param headRoom the new headroom value; the reference is stored as-is,
 *                 without copying
 */
public void setApplicationAttemptHeadRoom(Resource headRoom) {
  applicationHeadroom = headRoom;
}
}
Expand Up @@ -632,4 +632,14 @@ public void incNumAllocatedContainers(NodeType containerType,
requestType);
}
}

/**
 * Publishes the given headroom to the metrics of the application's current
 * attempt. Nothing is recorded when the application has no current attempt.
 *
 * @param headroom headroom to record; a clone is stored, so later changes
 *                 to the caller's object do not affect the stored value
 */
public void setApplicationHeadroomForMetrics(Resource headroom) {
  // NOTE(review): the application lookup result is dereferenced without a
  // null check -- confirm the app is always present in getRMApps() here.
  RMAppAttempt currentAttempt = rmContext.getRMApps()
      .get(attemptId.getApplicationId())
      .getCurrentAppAttempt();
  if (currentAttempt == null) {
    return;
  }
  currentAttempt.getRMAppAttemptMetrics()
      .setApplicationAttemptHeadRoom(Resources.clone(headroom));
}
}
Expand Up @@ -419,10 +419,13 @@ public synchronized User getUser(String userName) {
*/
public synchronized ArrayList<UserInfo> getUsers() {
ArrayList<UserInfo> usersToReturn = new ArrayList<UserInfo>();
for (Map.Entry<String, User> entry: users.entrySet()) {
usersToReturn.add(new UserInfo(entry.getKey(), Resources.clone(
entry.getValue().getUsed()), entry.getValue().getActiveApplications(),
entry.getValue().getPendingApplications()));
for (Map.Entry<String, User> entry : users.entrySet()) {
User user = entry.getValue();
usersToReturn.add(new UserInfo(entry.getKey(), Resources.clone(user
.getUsed()), user.getActiveApplications(), user
.getPendingApplications(), Resources.clone(user
.getConsumedAMResources()), Resources.clone(user
.getUserResourceLimit())));
}
return usersToReturn;
}
Expand Down Expand Up @@ -1068,7 +1071,7 @@ private Resource computeUserLimit(FiCaSchedulerApp application,
" clusterCapacity: " + clusterResource
);
}

user.setUserResourceLimit(limit);
return limit;
}

Expand Down Expand Up @@ -1738,6 +1741,7 @@ resourceCalculator, this, getParent(), clusterResource,
@VisibleForTesting
public static class User {
ResourceUsage userResourceUsage = new ResourceUsage();
volatile Resource userResourceLimit = Resource.newInstance(0, 0);
int pendingApplications = 0;
int activeApplications = 0;

Expand Down Expand Up @@ -1807,6 +1811,14 @@ public void releaseContainer(Resource resource, Set<String> nodeLabels) {
}
}
}

/**
 * @return the resource limit last stored for this user. The stored object
 *         itself is returned, not a copy.
 */
public Resource getUserResourceLimit() {
  return this.userResourceLimit;
}

/**
 * Stores a new resource limit for this user.
 *
 * @param userResourceLimit the limit to record; the reference is kept
 *                          as-is, without copying
 */
public void setUserResourceLimit(Resource userResourceLimit) {
// The parameter shadows the field, so the "this." qualifier is required.
this.userResourceLimit = userResourceLimit;
}
}

@Override
Expand Down
Expand Up @@ -32,14 +32,19 @@ public class UserInfo {
protected ResourceInfo resourcesUsed;
protected int numPendingApplications;
protected int numActiveApplications;
protected ResourceInfo AMResourceUsed;
protected ResourceInfo userResourceLimit;

UserInfo() {}

UserInfo(String username, Resource resUsed, int activeApps, int pendingApps) {
UserInfo(String username, Resource resUsed, int activeApps, int pendingApps,
Resource amResUsed, Resource resourceLimit) {
this.username = username;
this.resourcesUsed = new ResourceInfo(resUsed);
this.numActiveApplications = activeApps;
this.numPendingApplications = pendingApps;
this.AMResourceUsed = new ResourceInfo(amResUsed);
this.userResourceLimit = new ResourceInfo(resourceLimit);
}

public String getUsername() {
Expand All @@ -57,4 +62,12 @@ public int getNumPendingApplications() {
/**
 * @return the number of active applications recorded for this user
 */
public int getNumActiveApplications() {
  return this.numActiveApplications;
}

/**
 * @return the value of the {@code AMResourceUsed} field for this user
 */
public ResourceInfo getAMResourcesUsed() {
  return this.AMResourceUsed;
}

/**
 * @return the value of the {@code userResourceLimit} field for this user
 */
public ResourceInfo getUserResourceLimit() {
  return this.userResourceLimit;
}
}
Expand Up @@ -268,7 +268,9 @@ public synchronized Allocation getAllocation(ResourceCalculator rc,
minimumAllocation, numCont);
ContainersAndNMTokensAllocation allocation =
pullNewlyAllocatedContainersAndNMTokens();
return new Allocation(allocation.getContainerList(), getHeadroom(), null,
Resource headroom = getHeadroom();
setApplicationHeadroomForMetrics(headroom);
return new Allocation(allocation.getContainerList(), headroom, null,
currentContPreemption, Collections.singletonList(rr),
allocation.getNMTokenList());
}
Expand Down
Expand Up @@ -938,9 +938,10 @@ clusterResource, minimumAllocation, getMaximumResourceCapability(),
application.updateBlacklist(blacklistAdditions, blacklistRemovals);
ContainersAndNMTokensAllocation allocation =
application.pullNewlyAllocatedContainersAndNMTokens();
return new Allocation(allocation.getContainerList(),
application.getHeadroom(), preemptionContainerIds, null, null,
allocation.getNMTokenList());
Resource headroom = application.getHeadroom();
application.setApplicationHeadroomForMetrics(headroom);
return new Allocation(allocation.getContainerList(), headroom,
preemptionContainerIds, null, null, allocation.getNMTokenList());
}
}

Expand Down
Expand Up @@ -343,9 +343,10 @@ public Allocation allocate(
application.updateBlacklist(blacklistAdditions, blacklistRemovals);
ContainersAndNMTokensAllocation allocation =
application.pullNewlyAllocatedContainersAndNMTokens();
return new Allocation(allocation.getContainerList(),
application.getHeadroom(), null, null, null,
allocation.getNMTokenList());
Resource headroom = application.getHeadroom();
application.setApplicationHeadroomForMetrics(headroom);
return new Allocation(allocation.getContainerList(), headroom, null,
null, null, allocation.getNMTokenList());
}
}

Expand Down
Expand Up @@ -37,6 +37,8 @@
import org.apache.hadoop.yarn.webapp.hamlet.Hamlet;
import org.apache.hadoop.yarn.webapp.hamlet.Hamlet.DIV;
import org.apache.hadoop.yarn.webapp.hamlet.Hamlet.LI;
import org.apache.hadoop.yarn.webapp.hamlet.Hamlet.TABLE;
import org.apache.hadoop.yarn.webapp.hamlet.Hamlet.TBODY;
import org.apache.hadoop.yarn.webapp.hamlet.Hamlet.UL;
import org.apache.hadoop.yarn.webapp.view.HtmlBlock;
import org.apache.hadoop.yarn.webapp.view.InfoBlock;
Expand Down Expand Up @@ -67,41 +69,8 @@ static class LeafQueueInfoBlock extends HtmlBlock {
lqinfo = (CapacitySchedulerLeafQueueInfo) info.qinfo;
}

/**
 * Returns an HTML snippet describing one resource as a share of another,
 * in the form {@code "Memory: <m> (<pct>%), vCores: <v> (<pct>%)"}.
 *
 * <p>A dimension's value is omitted when {@code numerator} is null. Its
 * percentage is omitted when either argument is null or when the
 * denominator's value for that dimension is zero, which avoids both a
 * NullPointerException and a division by zero.
 *
 * @param numerator   the resource being described; may be null
 * @param denominator the resource the numerator is compared against;
 *                    may be null
 * @return the formatted description, never null
 */
private String getPercentage(ResourceInfo numerator, ResourceInfo denominator) {
  StringBuilder percentString = new StringBuilder("Memory: ");
  if (numerator != null) {
    percentString.append(numerator.getMemory());
    // Only compute a percentage when there is something to divide by.
    if (denominator != null && denominator.getMemory() != 0) {
      percentString.append(" (<span title='of used resources in this queue'>")
          .append(StringUtils.format("%.2f",
              numerator.getMemory() * 100.0 / denominator.getMemory()))
          .append("%</span>)");
    }
  }
  percentString.append(", vCores: ");
  if (numerator != null) {
    percentString.append(numerator.getvCores());
    if (denominator != null && denominator.getvCores() != 0) {
      percentString.append(" (<span title='of used resources in this queue'>")
          .append(StringUtils.format("%.2f",
              numerator.getvCores() * 100.0 / denominator.getvCores()))
          .append("%</span>)");
    }
  }
  return percentString.toString();
}

@Override
protected void render(Block html) {
StringBuilder activeUserList = new StringBuilder("");
ResourceInfo usedResources = lqinfo.getResourcesUsed();
ArrayList<UserInfo> users = lqinfo.getUsers().getUsersList();
for (UserInfo entry: users) {
activeUserList.append(entry.getUsername()).append(" &lt;")
.append(getPercentage(entry.getResourcesUsed(), usedResources))
.append(", Schedulable Apps: " + entry.getNumActiveApplications())
.append(", Non-Schedulable Apps: " + entry.getNumPendingApplications())
.append("&gt;<br style='display:block'>"); //Force line break
}

ResponseInfo ri = info("\'" + lqinfo.getQueuePath().substring(5) + "\' Queue Status").
_("Queue State:", lqinfo.getQueueState()).
Expand All @@ -116,12 +85,12 @@ protected void render(Block html) {
_("Max Applications:", Integer.toString(lqinfo.getMaxApplications())).
_("Max Applications Per User:", Integer.toString(lqinfo.getMaxApplicationsPerUser())).
_("Max Application Master Resources:", lqinfo.getAMResourceLimit().toString()).
_("Used Application Master Resources:", lqinfo.getUsedAMResource().toString()).
_("Max Application Master Resources Per User:", lqinfo.getUserAMResourceLimit().toString()).
_("Configured Capacity:", percent(lqinfo.getCapacity() / 100)).
_("Configured Max Capacity:", percent(lqinfo.getMaxCapacity() / 100)).
_("Configured Minimum User Limit Percent:", Integer.toString(lqinfo.getUserLimit()) + "%").
_("Configured User Limit Factor:", String.format("%.1f", lqinfo.getUserLimitFactor())).
_r("Active Users: ", activeUserList.toString()).
_("Accessible Node Labels:", StringUtils.join(",", lqinfo.getNodeLabels())).
_("Preemption:", lqinfo.getPreemptionDisabled() ? "disabled" : "enabled");

Expand All @@ -132,6 +101,44 @@ protected void render(Block html) {
}
}

/**
 * HTML block that renders one table row per user of a leaf queue into the
 * {@code #userinfo} table. Columns are user name, max/used resource,
 * max/used AM resource, and schedulable/non-schedulable app counts.
 */
static class QueueUsersInfoBlock extends HtmlBlock {
  final CapacitySchedulerLeafQueueInfo lqinfo;

  @Inject
  QueueUsersInfoBlock(ViewContext ctx, CSQInfo info) {
    super(ctx);
    // The block is only meaningful for leaf queues, hence the cast.
    this.lqinfo = (CapacitySchedulerLeafQueueInfo) info.qinfo;
  }

  @Override
  protected void render(Block html) {
    // Open the table and emit the header row; the body stays open so that
    // user rows can be appended below.
    TBODY<TABLE<Hamlet>> userRows =
        html.table("#userinfo")
            .thead().$class("ui-widget-header")
              .tr()
                .th().$class("ui-state-default")._("User Name")._()
                .th().$class("ui-state-default")._("Max Resource")._()
                .th().$class("ui-state-default")._("Used Resource")._()
                .th().$class("ui-state-default")._("Max AM Resource")._()
                .th().$class("ui-state-default")._("Used AM Resource")._()
                .th().$class("ui-state-default")._("Schedulable Apps")._()
                .th().$class("ui-state-default")._("Non-Schedulable Apps")._()
              ._()
            ._()
            .tbody();

    for (UserInfo user : lqinfo.getUsers().getUsersList()) {
      userRows.tr()
          .td(user.getUsername())
          .td(user.getUserResourceLimit().toString())
          .td(user.getResourcesUsed().toString())
          // The per-user AM limit comes from the queue, not from the user.
          .td(lqinfo.getUserAMResourceLimit().toString())
          .td(user.getAMResourcesUsed().toString())
          .td(String.valueOf(user.getNumActiveApplications()))
          .td(String.valueOf(user.getNumPendingApplications()))
          ._();
    }

    // The heading is emitted before the table body is closed; this order
    // is kept exactly as it was.
    html.div().$class("usersinfo").h5("Active Users Info")._();
    userRows._()._();
  }
}

public static class QueueBlock extends HtmlBlock {
final CSQInfo csqinfo;

Expand Down Expand Up @@ -166,6 +173,7 @@ public void render(Block html) {
csqinfo.qinfo = info;
if (info.getQueues() == null) {
li.ul("#lq").li()._(LeafQueueInfoBlock.class)._()._();
li.ul("#lq").li()._(QueueUsersInfoBlock.class)._()._();
} else {
li._(QueueBlock.class);
}
Expand Down
Expand Up @@ -21,6 +21,7 @@
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ClusterMetricsInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.SchedulerInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.UserMetricsInfo;

import org.apache.hadoop.yarn.webapp.hamlet.Hamlet;
Expand Down Expand Up @@ -153,6 +154,27 @@ protected void render(Block html) {

}
}

SchedulerInfo schedulerInfo=new SchedulerInfo(this.rm);

div.h3("Scheduler Metrics").
table("#schedulermetricsoverview").
thead().$class("ui-widget-header").
tr().
th().$class("ui-state-default")._("Scheduler Type")._().
th().$class("ui-state-default")._("Scheduling Resource Type")._().
th().$class("ui-state-default")._("Minimum Allocation")._().
th().$class("ui-state-default")._("Maximum Allocation")._().
_().
_().
tbody().$class("ui-widget-content").
tr().
td(String.valueOf(schedulerInfo.getSchedulerType())).
td(String.valueOf(schedulerInfo.getSchedulerResourceTypes())).
td(schedulerInfo.getMinAllocation().toString()).
td(schedulerInfo.getMaxAllocation().toString()).
_().
_()._();

div._();
}
Expand Down
Expand Up @@ -35,7 +35,8 @@ public class CapacitySchedulerLeafQueueInfo extends CapacitySchedulerQueueInfo {
protected int userLimit;
protected UsersInfo users; // To add another level in the XML
protected float userLimitFactor;
protected ResourceInfo aMResourceLimit;
protected ResourceInfo AMResourceLimit;
protected ResourceInfo usedAMResource;
protected ResourceInfo userAMResourceLimit;
protected boolean preemptionDisabled;

Expand All @@ -52,7 +53,8 @@ public class CapacitySchedulerLeafQueueInfo extends CapacitySchedulerQueueInfo {
userLimit = q.getUserLimit();
users = new UsersInfo(q.getUsers());
userLimitFactor = q.getUserLimitFactor();
aMResourceLimit = new ResourceInfo(q.getAMResourceLimit());
AMResourceLimit = new ResourceInfo(q.getAMResourceLimit());
usedAMResource = new ResourceInfo(q.getQueueResourceUsage().getAMUsed());
userAMResourceLimit = new ResourceInfo(q.getUserAMResourceLimit());
preemptionDisabled = q.getPreemptionDisabled();
}
Expand Down Expand Up @@ -91,9 +93,13 @@ public float getUserLimitFactor() {
}

public ResourceInfo getAMResourceLimit() {
return aMResourceLimit;
return AMResourceLimit;
}

/**
 * @return the value of the {@code usedAMResource} field for this queue
 */
public ResourceInfo getUsedAMResource() {
  return this.usedAMResource;
}

/**
 * @return the value of the {@code userAMResourceLimit} field for this queue
 */
public ResourceInfo getUserAMResourceLimit() {
  return this.userAMResourceLimit;
}
Expand Down

0 comments on commit 658097d

Please sign in to comment.