Skip to content

Commit

Permalink
merge: #13886
Browse files Browse the repository at this point in the history
13886: fix(engine): cleanup orphaned job timeouts and backoffs on migration r=koevskinikola a=oleschoenburg

After an update from a previous version, both backoff and deadline column families might contain entries without a corresponding job or multiple entries for a single job. 
Before fixing #12797 and #13041, these were cleaned up ad hoc whenever they were found. This is no longer the case because we now prevent the creation of duplicated entries and always clean up properly.

This adds two necessary migrations that remove orphaned entries left behind by a previous version. The migrations run once, walking through all deadline and backoff entries and removing those without a corresponding job as well as duplicates that don't match the current job state.

closes #13881

Co-authored-by: Ole Schönburg <ole.schoenburg@gmail.com>
Co-authored-by: Meggle (Sebastian Bathke) <sebastian.bathke@camunda.com>
  • Loading branch information
3 people committed Aug 15, 2023
2 parents 2fd27d5 + dcb132b commit fe0117e
Show file tree
Hide file tree
Showing 7 changed files with 318 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,34 @@ public JobRecord updateJobRetries(final long jobKey, final int retries) {
return job;
}

@Override
public void cleanupTimeoutsWithoutJobs() {
  // Walk every deadline entry; returning true from the visitor keeps the
  // iteration going until the column family is exhausted.
  deadlinesColumnFamily.whileTrue(
      (compositeKey, nil) -> {
        final long storedDeadline = compositeKey.first().getValue();
        final var job = jobsColumnFamily.get(compositeKey.second().inner());
        // An entry is stale when no job exists for it anymore, or when its
        // deadline no longer matches the deadline on the current job record
        // (i.e. it is a leftover duplicate from an older version).
        final boolean stale = job == null || job.getRecord().getDeadline() != storedDeadline;
        if (stale) {
          deadlinesColumnFamily.deleteExisting(compositeKey);
        }
        return true;
      });
}

@Override
public void cleanupBackoffsWithoutJobs() {
  backoffColumnFamily.whileTrue(
      (entryKey, ignoredNil) -> {
        // Key layout: first = backoff timestamp, second = foreign key to the job.
        final var job = jobsColumnFamily.get(entryKey.second().inner());
        // Delete orphans (job gone) and duplicates whose backoff value differs
        // from the retry backoff currently stored on the job record.
        if (job == null || job.getRecord().getRetryBackoff() != entryKey.first().getValue()) {
          backoffColumnFamily.deleteExisting(entryKey);
        }
        return true; // keep visiting all backoff entries
      });
}

private void createJob(final long key, final JobRecord record, final DirectBuffer type) {
createJobRecord(key, record);
initializeJobState();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ public class DbMigratorImpl implements DbMigrator {
new DecisionMigration(),
new DecisionRequirementsMigration(),
new ProcessInstanceByProcessDefinitionMigration(),
new ProcessDefinitionVersionMigration());
new ProcessDefinitionVersionMigration(),
new JobTimeoutCleanupMigration(),
new JobBackoffCleanupMigration());
// Be mindful of https://github.com/camunda/zeebe/issues/7248. In particular, that issue
// should be solved first, before adding any migration that can take a long time

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/*
* Copyright Camunda Services GmbH and/or licensed to Camunda Services GmbH under
* one or more contributor license agreements. See the NOTICE file distributed
* with this work for additional information regarding copyright ownership.
* Licensed under the Zeebe Community License 1.1. You may not use this file
* except in compliance with the Zeebe Community License 1.1.
*/
package io.camunda.zeebe.engine.state.migration;

import io.camunda.zeebe.engine.state.mutable.MutableProcessingState;

/**
 * One-time migration that removes stale entries from the job backoff column family: entries
 * referencing a job that no longer exists, and duplicate entries whose backoff value does not
 * match the job's current retry backoff.
 */
public class JobBackoffCleanupMigration implements MigrationTask {

  @Override
  public void runMigration(final MutableProcessingState processingState) {
    // Delegates the actual cleanup to the job state, which owns the column families.
    processingState.getJobState().cleanupBackoffsWithoutJobs();
  }

  @Override
  public String getIdentifier() {
    // The simple class name uniquely identifies this migration so it only runs once.
    return getClass().getSimpleName();
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/*
* Copyright Camunda Services GmbH and/or licensed to Camunda Services GmbH under
* one or more contributor license agreements. See the NOTICE file distributed
* with this work for additional information regarding copyright ownership.
* Licensed under the Zeebe Community License 1.1. You may not use this file
* except in compliance with the Zeebe Community License 1.1.
*/
package io.camunda.zeebe.engine.state.migration;

import io.camunda.zeebe.engine.state.mutable.MutableProcessingState;

/**
 * One-time migration that removes stale entries from the job deadlines column family: entries
 * referencing a job that no longer exists, and duplicate entries whose deadline does not match
 * the deadline stored on the current job record.
 */
public class JobTimeoutCleanupMigration implements MigrationTask {

  @Override
  public void runMigration(final MutableProcessingState processingState) {
    // Delegates the actual cleanup to the job state, which owns the column families.
    processingState.getJobState().cleanupTimeoutsWithoutJobs();
  }

  @Override
  public String getIdentifier() {
    // The simple class name uniquely identifies this migration so it only runs once.
    return getClass().getSimpleName();
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,8 @@ public interface MutableJobState extends JobState {
void resolve(long key, JobRecord updatedValue);

JobRecord updateJobRetries(long jobKey, int retries);

/**
 * Removes entries from the job-deadlines column family that no longer correspond to an existing
 * job, or whose deadline does not match the job's current deadline. Intended to be invoked once
 * by a state migration after updating from an older version.
 */
void cleanupTimeoutsWithoutJobs();

/**
 * Removes entries from the job-backoff column family that no longer correspond to an existing
 * job, or whose backoff value does not match the job's current retry backoff. Intended to be
 * invoked once by a state migration after updating from an older version.
 */
void cleanupBackoffsWithoutJobs();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
/*
* Copyright Camunda Services GmbH and/or licensed to Camunda Services GmbH under
* one or more contributor license agreements. See the NOTICE file distributed
* with this work for additional information regarding copyright ownership.
* Licensed under the Zeebe Community License 1.1. You may not use this file
* except in compliance with the Zeebe Community License 1.1.
*/
package io.camunda.zeebe.engine.state.migration;

import static org.assertj.core.api.Assertions.assertThat;

import io.camunda.zeebe.db.ColumnFamily;
import io.camunda.zeebe.db.TransactionContext;
import io.camunda.zeebe.db.ZeebeDb;
import io.camunda.zeebe.db.impl.DbCompositeKey;
import io.camunda.zeebe.db.impl.DbForeignKey;
import io.camunda.zeebe.db.impl.DbLong;
import io.camunda.zeebe.db.impl.DbNil;
import io.camunda.zeebe.engine.state.instance.JobRecordValue;
import io.camunda.zeebe.engine.state.mutable.MutableProcessingState;
import io.camunda.zeebe.engine.util.ProcessingStateExtension;
import io.camunda.zeebe.protocol.ZbColumnFamilies;
import io.camunda.zeebe.protocol.impl.record.value.job.JobRecord;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;

@ExtendWith(ProcessingStateExtension.class)
/**
 * Tests for {@link JobBackoffCleanupMigration}: valid backoff entries survive the cleanup, while
 * orphaned and non-matching (duplicate) entries are removed.
 *
 * <p>Fix: the test method and variable names were copy-pasted from the timeout test and said
 * "timeout"/"deadline" even though this class exercises the backoff column family; they are
 * renamed accordingly (no external callers reference test method names).
 */
@ExtendWith(ProcessingStateExtension.class)
public class JobBackoffCleanupMigrationTest {

  final JobBackoffCleanupMigration jobBackoffCleanupMigration = new JobBackoffCleanupMigration();

  // NOTE(review): these three fields appear to be injected by ProcessingStateExtension — they are
  // never assigned in this class; confirm against the extension implementation.
  private ZeebeDb<ZbColumnFamilies> zeebeDb;
  private MutableProcessingState processingState;
  private TransactionContext transactionContext;

  private final JobRecordValue jobRecordToRead = new JobRecordValue();
  private DbLong jobKey;
  private ColumnFamily<DbLong, JobRecordValue> jobsColumnFamily;

  private DbLong backoffKey;
  private DbCompositeKey<DbLong, DbForeignKey<DbLong>> backoffJobKey;
  private ColumnFamily<DbCompositeKey<DbLong, DbForeignKey<DbLong>>, DbNil> backoffColumnFamily;

  /** Recreates the jobs and backoff column families on the injected database before each test. */
  @BeforeEach
  public void setup() {
    jobKey = new DbLong();
    final DbForeignKey<DbLong> fkJob = new DbForeignKey<>(jobKey, ZbColumnFamilies.JOBS);
    jobsColumnFamily =
        zeebeDb.createColumnFamily(
            ZbColumnFamilies.JOBS, transactionContext, jobKey, jobRecordToRead);

    backoffKey = new DbLong();
    backoffJobKey = new DbCompositeKey<>(backoffKey, fkJob);
    backoffColumnFamily =
        zeebeDb.createColumnFamily(
            ZbColumnFamilies.JOB_BACKOFF, transactionContext, backoffJobKey, DbNil.INSTANCE);

    jobKey.wrapLong(1);
  }

  /** A backoff entry that matches the job's current retry backoff must survive the cleanup. */
  @Test
  public void afterCleanupValidBackoffIsStillPresent() {
    // given
    final int retryBackoff = 123;
    jobsColumnFamily.upsert(jobKey, createJobRecordValue(retryBackoff));
    backoffKey.wrapLong(retryBackoff);
    backoffColumnFamily.upsert(backoffJobKey, DbNil.INSTANCE);

    // when
    jobBackoffCleanupMigration.runMigration(processingState);

    // then
    assertThat(backoffColumnFamily.exists(backoffJobKey)).isTrue();
  }

  /** A backoff entry whose job was deleted must be removed by the cleanup. */
  @Test
  public void afterCleanupOrphanedBackoffIsDeleted() {
    // given
    jobsColumnFamily.upsert(jobKey, new JobRecordValue());
    backoffKey.wrapLong(123);
    backoffColumnFamily.upsert(backoffJobKey, DbNil.INSTANCE);
    jobsColumnFamily.deleteExisting(jobKey);

    // when
    jobBackoffCleanupMigration.runMigration(processingState);

    // then
    assertThat(backoffColumnFamily.exists(backoffJobKey)).isFalse();
  }

  /**
   * When two backoff entries exist for the same job, only the one matching the job's current
   * retry backoff survives; the stale duplicate is deleted.
   */
  @Test
  public void afterCleanupBackoffWithNonMatchingRetryBackoffIsDeleted() {
    // given
    final int firstRetryBackoff = 123;
    final int secondRetryBackoff = 456;
    jobsColumnFamily.upsert(jobKey, createJobRecordValue(secondRetryBackoff));
    backoffKey.wrapLong(firstRetryBackoff);
    backoffColumnFamily.upsert(backoffJobKey, DbNil.INSTANCE);
    backoffKey.wrapLong(secondRetryBackoff);
    backoffColumnFamily.upsert(backoffJobKey, DbNil.INSTANCE);

    // when
    jobBackoffCleanupMigration.runMigration(processingState);

    // then
    backoffKey.wrapLong(firstRetryBackoff);
    assertThat(backoffColumnFamily.exists(backoffJobKey)).isFalse();
    backoffKey.wrapLong(secondRetryBackoff);
    assertThat(backoffColumnFamily.exists(backoffJobKey)).isTrue();
  }

  /** Builds a job record value whose record carries the given retry backoff. */
  private static JobRecordValue createJobRecordValue(final long retryBackoff) {
    final JobRecordValue jobRecordValue = new JobRecordValue();
    jobRecordValue.setRecordWithoutVariables(new JobRecord().setRetryBackoff(retryBackoff));
    return jobRecordValue;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
/*
* Copyright Camunda Services GmbH and/or licensed to Camunda Services GmbH under
* one or more contributor license agreements. See the NOTICE file distributed
* with this work for additional information regarding copyright ownership.
* Licensed under the Zeebe Community License 1.1. You may not use this file
* except in compliance with the Zeebe Community License 1.1.
*/
package io.camunda.zeebe.engine.state.migration;

import static org.assertj.core.api.Assertions.assertThat;

import io.camunda.zeebe.db.ColumnFamily;
import io.camunda.zeebe.db.TransactionContext;
import io.camunda.zeebe.db.ZeebeDb;
import io.camunda.zeebe.db.impl.DbCompositeKey;
import io.camunda.zeebe.db.impl.DbForeignKey;
import io.camunda.zeebe.db.impl.DbLong;
import io.camunda.zeebe.db.impl.DbNil;
import io.camunda.zeebe.engine.state.instance.JobRecordValue;
import io.camunda.zeebe.engine.state.mutable.MutableProcessingState;
import io.camunda.zeebe.engine.util.ProcessingStateExtension;
import io.camunda.zeebe.protocol.ZbColumnFamilies;
import io.camunda.zeebe.protocol.impl.record.value.job.JobRecord;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;

@ExtendWith(ProcessingStateExtension.class)
/**
 * Tests for {@link JobTimeoutCleanupMigration}: valid deadline entries survive the cleanup, while
 * orphaned and non-matching (duplicate) entries are removed.
 *
 * <p>Consistency fix: {@code fkJob} was a class field but is only used inside {@code setup()};
 * it is now a local variable, matching the sibling backoff cleanup test.
 */
@ExtendWith(ProcessingStateExtension.class)
public class JobTimeoutCleanupMigrationTest {

  final JobTimeoutCleanupMigration jobTimeoutCleanupMigration = new JobTimeoutCleanupMigration();

  // NOTE(review): these three fields appear to be injected by ProcessingStateExtension — they are
  // never assigned in this class; confirm against the extension implementation.
  private ZeebeDb<ZbColumnFamilies> zeebeDb;
  private MutableProcessingState processingState;
  private TransactionContext transactionContext;

  private final JobRecordValue jobRecordToRead = new JobRecordValue();
  private DbLong jobKey;
  private ColumnFamily<DbLong, JobRecordValue> jobsColumnFamily;

  private DbLong deadlineKey;
  private DbCompositeKey<DbLong, DbForeignKey<DbLong>> deadlineJobKey;
  private ColumnFamily<DbCompositeKey<DbLong, DbForeignKey<DbLong>>, DbNil> deadlinesColumnFamily;

  /** Recreates the jobs and deadlines column families on the injected database before each test. */
  @BeforeEach
  public void setup() {
    jobKey = new DbLong();
    final DbForeignKey<DbLong> fkJob = new DbForeignKey<>(jobKey, ZbColumnFamilies.JOBS);
    jobsColumnFamily =
        zeebeDb.createColumnFamily(
            ZbColumnFamilies.JOBS, transactionContext, jobKey, jobRecordToRead);

    deadlineKey = new DbLong();
    deadlineJobKey = new DbCompositeKey<>(deadlineKey, fkJob);
    deadlinesColumnFamily =
        zeebeDb.createColumnFamily(
            ZbColumnFamilies.JOB_DEADLINES, transactionContext, deadlineJobKey, DbNil.INSTANCE);

    jobKey.wrapLong(1);
  }

  /** A deadline entry that matches the job's current deadline must survive the cleanup. */
  @Test
  public void afterCleanupValidTimeoutIsStillPresent() {
    // given
    final int deadline = 123;
    jobsColumnFamily.upsert(jobKey, createJobRecordValue(deadline));
    deadlineKey.wrapLong(deadline);
    deadlinesColumnFamily.upsert(deadlineJobKey, DbNil.INSTANCE);

    // when
    jobTimeoutCleanupMigration.runMigration(processingState);

    // then
    assertThat(deadlinesColumnFamily.exists(deadlineJobKey)).isTrue();
  }

  /** A deadline entry whose job was deleted must be removed by the cleanup. */
  @Test
  public void afterCleanupOrphanedTimeoutIsDeleted() {
    // given
    jobsColumnFamily.upsert(jobKey, new JobRecordValue());
    deadlineKey.wrapLong(123);
    deadlinesColumnFamily.upsert(deadlineJobKey, DbNil.INSTANCE);
    jobsColumnFamily.deleteExisting(jobKey);

    // when
    jobTimeoutCleanupMigration.runMigration(processingState);

    // then
    assertThat(deadlinesColumnFamily.exists(deadlineJobKey)).isFalse();
  }

  /**
   * When two deadline entries exist for the same job, only the one matching the job's current
   * deadline survives; the stale duplicate is deleted.
   */
  @Test
  public void afterCleanupTimeoutWithNonMatchingDeadlineIsDeleted() {
    // given
    final int firstDeadline = 123;
    final int secondDeadline = 456;
    jobsColumnFamily.upsert(jobKey, createJobRecordValue(secondDeadline));
    deadlineKey.wrapLong(firstDeadline);
    deadlinesColumnFamily.upsert(deadlineJobKey, DbNil.INSTANCE);
    deadlineKey.wrapLong(secondDeadline);
    deadlinesColumnFamily.upsert(deadlineJobKey, DbNil.INSTANCE);

    // when
    jobTimeoutCleanupMigration.runMigration(processingState);

    // then
    deadlineKey.wrapLong(firstDeadline);
    assertThat(deadlinesColumnFamily.exists(deadlineJobKey)).isFalse();
    deadlineKey.wrapLong(secondDeadline);
    assertThat(deadlinesColumnFamily.exists(deadlineJobKey)).isTrue();
  }

  /** Builds a job record value whose record carries the given deadline. */
  private static JobRecordValue createJobRecordValue(final long deadline) {
    final JobRecordValue jobRecordValue = new JobRecordValue();
    jobRecordValue.setRecordWithoutVariables(new JobRecord().setDeadline(deadline));
    return jobRecordValue;
  }
}

0 comments on commit fe0117e

Please sign in to comment.