Skip to content
Permalink
Browse files
JIRA-1141
closes #33
  • Loading branch information
Maja Kabiljo committed Apr 4, 2017
1 parent e354e06 commit c9621e30ba79b20322f449e24a2f438918ce33fa
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 3 deletions.
@@ -150,10 +150,13 @@ public boolean isDone(int expectedWorkersDone) {
return workersDone == expectedWorkersDone;
}

@Override
public String toString() {
/**
* Get string describing total job progress
*
* @return String describing total job progress
*/
protected String getProgressString() {
StringBuilder sb = new StringBuilder();
sb.append("Data from ").append(workersInSuperstep).append(" workers - ");
if (isInputSuperstep()) {
sb.append("Loading data: ");
if (!masterProgress.vertexInputSplitsSet() ||
@@ -189,6 +192,14 @@ public String toString() {
sb.append(partitionsStored).append(" out of ").append(
partitionsToStore).append(" partitions stored");
}
return sb.toString();
}

@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("Data from ").append(workersInSuperstep).append(" workers - ");
sb.append(getProgressString());
sb.append("; min free memory on worker ").append(
workerWithMinFreeMemory).append(" - ").append(
DECIMAL_FORMAT.format(minFreeMemoryMB)).append("MB, average ").append(
@@ -204,4 +215,19 @@ public String toString() {
}
return sb.toString();
}

/**
* Check if this instance made progress from another instance
*
* @param lastProgress Instance to compare with
* @return True iff progress was made
*/
public boolean madeProgressFrom(CombinedWorkerProgress lastProgress) {
// If progress strings are different there was progress made
if (!getProgressString().equals(lastProgress.getProgressString())) {
return true;
}
// If more workers were done there was progress made
return workersDone != lastProgress.workersDone;
}
}
@@ -20,6 +20,7 @@

import org.apache.giraph.conf.GiraphConfiguration;
import org.apache.giraph.conf.GiraphConstants;
import org.apache.giraph.conf.IntConfOption;
import org.apache.giraph.master.MasterProgress;
import org.apache.giraph.utils.ThreadUtils;
import org.apache.giraph.worker.WorkerProgress;
@@ -36,6 +37,12 @@
*/
public class DefaultJobProgressTrackerService
implements JobProgressTrackerService {
/** Max time job is allowed to not make progress before getting killed */
public static final IntConfOption MAX_ALLOWED_TIME_WITHOUT_PROGRESS_MS =
new IntConfOption(
"giraph.maxAllowedTimeWithoutProgressMs",
3 * 60 * 60 * 1000, // Allow 3h
"Max time job is allowed to not make progress before getting killed");
/** Class logger */
private static final Logger LOG =
Logger.getLogger(JobProgressTrackerService.class);
@@ -81,6 +88,10 @@ private void startWriterThread() {
writerThread = ThreadUtils.startThread(new Runnable() {
@Override
public void run() {
long lastTimeProgressChanged = -1;
long maxAllowedTimeWithoutProgress =
MAX_ALLOWED_TIME_WITHOUT_PROGRESS_MS.get(conf);
CombinedWorkerProgress lastProgress = null;
while (!finished) {
if (mappersStarted == conf.getMaxWorkers() + 1 &&
!workerProgresses.isEmpty()) {
@@ -95,6 +106,16 @@ public void run() {
if (combinedWorkerProgress.isDone(conf.getMaxWorkers())) {
break;
}

if (lastProgress == null ||
combinedWorkerProgress.madeProgressFrom(lastProgress)) {
lastProgress = combinedWorkerProgress;
lastTimeProgressChanged = System.currentTimeMillis();
} else if (lastTimeProgressChanged +
maxAllowedTimeWithoutProgress < System.currentTimeMillis()) {
killTooLongJob();
break;
}
}
if (!ThreadUtils.trySleep(UPDATE_MILLISECONDS)) {
break;
@@ -104,6 +125,21 @@ public void run() {
}, "progress-writer");
}

/**
* Kill the job which was taking too long to make any progress
*/
protected void killTooLongJob() {
// Job didn't make progress in too long, killing it
try {
LOG.error("Killing the job because it didn't make progress for " +
MAX_ALLOWED_TIME_WITHOUT_PROGRESS_MS.get(conf) / 1000 + "s");
job.killJob();
} catch (IOException e) {
LOG.error(
"Failed to kill the job which wasn't making progress", e);
}
}

@Override
public void setJob(Job job) {
this.job = job;

0 comments on commit c9621e3

Please sign in to comment.