Skip to content
This repository has been archived by the owner. It is now read-only.
Permalink
Browse files
Revert "Fix cron id collision bug by avoiding state in Quartz jobs"
This reverts commit e2ea191.

A bug was found where jobs that were killed via the KILL_EXISTING flag would set `path` as `null` in
`JobDataMap` that would block concurrent runs, but that value would never be set to `key` after the
the delayed run finished because it would run outside of the `Job` execution.

The issue in https://reviews.apache.org/r/65680/ will occur again, but it is rare and has been
around for a few years.

This bug was not caught in the unit test `testKillExisting` because `executeWithReplay` is mocked
and runs synchronously within the `Job` execution, allowing the `key` to be persisted.

Reviewed at https://reviews.apache.org/r/65810/
  • Loading branch information
jordanly committed Feb 27, 2018
1 parent d95fc2f commit e3f496a65eae03dde0392627f3fcde7b93a82df9
Showing 2 changed files with 11 additions and 15 deletions.
@@ -27,6 +27,7 @@

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Iterables;
import com.google.common.collect.Sets;

import org.apache.aurora.common.stats.Stats;
import org.apache.aurora.common.stats.StatsProvider;
@@ -91,6 +92,7 @@ class AuroraCronJob implements Job, EventSubscriber {
private final StateManager stateManager;
private final BackoffHelper delayedStartBackoff;
private final BatchWorker<NoResult> batchWorker;
private final Set<IJobKey> killFollowups = Sets.newConcurrentHashSet();

/**
* Annotation for the max cron batch size.
@@ -143,16 +145,17 @@ void doExecute(JobExecutionContext context) throws JobExecutionException {

// Prevent a concurrent run for this job in case a previous trigger took longer to run.
// This approach relies on saving the "work in progress" token within the job context itself
// (see below).
// (see below) and relying on killFollowups to signal "work completion".
if (context.getJobDetail().getJobDataMap().containsKey(path)) {
CRON_JOB_CONCURRENT_RUNS.incrementAndGet();
if (context.getJobDetail().getJobDataMap().get(path) == null) {
if (killFollowups.contains(key)) {
context.getJobDetail().getJobDataMap().remove(path);
killFollowups.remove(key);
LOG.info("Resetting job context for cron {}", path);
} else {
LOG.info("Ignoring trigger as another concurrent run is active for cron {}", path);
return;
}

context.getJobDetail().getJobDataMap().remove(path);
LOG.info("Resetting job context for cron {}", path);
}

CompletableFuture<NoResult> scheduleResult = batchWorker.<NoResult>execute(storeProvider -> {
@@ -200,8 +203,6 @@ void doExecute(JobExecutionContext context) throws JobExecutionException {

LOG.info("Waiting for job to terminate before launching cron job " + path);
// Use job detail map to signal a "work in progress" condition to subsequent triggers.
// A null value indicates that work is still in progress, while a non-null value (key in
// this case) indicates that the work has completed.
context.getJobDetail().getJobDataMap().put(path, null);
batchWorker.executeWithReplay(
delayedStartBackoff.getBackoffStrategy(),
@@ -217,7 +218,7 @@ void doExecute(JobExecutionContext context) throws JobExecutionException {
}
})
.thenAccept(ignored -> {
context.getJobDetail().getJobDataMap().put(path, key);
killFollowups.add(key);
LOG.info("Finished delayed launch for cron " + path);
});
break;
@@ -50,7 +50,7 @@
import static org.easymock.EasyMock.eq;
import static org.easymock.EasyMock.expect;
import static org.easymock.EasyMock.expectLastCall;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

public class AuroraCronJobTest extends EasyMockTest {
@@ -157,18 +157,13 @@ public void testKillExisting() throws Exception {

// Simulate a trigger in progress.
jobDetails.getJobDataMap().put(JobKeys.canonicalString(AURORA_JOB_KEY), null);
assertEquals(
jobDetails.getJobDataMap().get(JobKeys.canonicalString(AURORA_JOB_KEY)),
null);
assertFalse(jobDetails.getJobDataMap().isEmpty());

// Attempt a concurrent run that must be rejected.
auroraCronJob.doExecute(context);

// Complete previous run and trigger another one.
killResult.complete(BatchWorker.NO_RESULT);
assertEquals(
jobDetails.getJobDataMap().get(JobKeys.canonicalString(AURORA_JOB_KEY)),
AURORA_JOB_KEY);
auroraCronJob.doExecute(context);
assertTrue(jobDetails.getJobDataMap().isEmpty());
}

0 comments on commit e3f496a

Please sign in to comment.