Skip to content

Commit

Permalink
HDDS-5118. Recover from failure during upgrade action. (#2179)
Browse files Browse the repository at this point in the history
  • Loading branch information
errose28 committed Apr 28, 2021
1 parent 40223ec commit 5927bd2
Show file tree
Hide file tree
Showing 7 changed files with 40 additions and 180 deletions.
Expand Up @@ -135,14 +135,6 @@ public void setLayoutVersion(int version) {
storageInfo.setLayoutVersion(version);
}

public void setUpgradeToLayoutVersion(int version) {
storageInfo.setUpgradingToLayoutVersion(version);
}

public void unsetUpgradeToLayoutVersion() {
storageInfo.unsetUpgradingToLayoutVersion();
}

public void setFirstUpgradeActionLayoutVersion(int version) {
storageInfo.setFirstUpgradeActionLayoutVersion(version);
}
Expand Down
Expand Up @@ -64,9 +64,6 @@ public class StorageInfo {
*/
private static final String LAYOUT_VERSION = "layoutVersion";

private static final String UPGRADING_TO_LAYOUT_VERSION =
"upgradingToLayoutVersion";

private static final String FIRST_UPGRADE_ACTION_LAYOUT_VERSION =
"firstUpgradeActionLayoutVersion";

Expand Down Expand Up @@ -100,7 +97,6 @@ public StorageInfo(NodeType type, File propertiesFile)
verifyClusterId();
verifyCreationTime();
verifyLayoutVersion();
verifyUpgradingToLayoutVersion();
}

public NodeType getNodeType() {
Expand All @@ -127,20 +123,11 @@ public int getLayoutVersion() {
return 0;
}

public int getUpgradingToLayoutVersion() {
String upgradingTo = properties.getProperty(UPGRADING_TO_LAYOUT_VERSION);
if (upgradingTo != null) {
return Integer.parseInt(upgradingTo);
}
return INVALID_LAYOUT_VERSION;
}

private void verifyLayoutVersion() {
String layout = getProperty(LAYOUT_VERSION);
if (layout == null) {
LOG.warn("Found " + STORAGE_FILE_VERSION + " file without any layout " +
"version. Defaulting to 0. This should happen only if a cluster is " +
"being upgraded from 0.5.0.");
"version. Defaulting to 0.");
setProperty(LAYOUT_VERSION, "0");
}
}
Expand Down Expand Up @@ -175,17 +162,6 @@ public void setLayoutVersion(int version) {
properties.setProperty(LAYOUT_VERSION, Integer.toString(version));
}

public void setUpgradingToLayoutVersion(int layoutVersion) {
//TODO: do we need to check consecutiveness of versions here?
// if so, then change setLayoutVersion to incLayoutVersion in APIs!
properties.setProperty(
UPGRADING_TO_LAYOUT_VERSION, Integer.toString(layoutVersion));
}

public void unsetUpgradingToLayoutVersion() {
properties.remove(UPGRADING_TO_LAYOUT_VERSION);
}

private void verifyNodeType(NodeType type)
throws InconsistentStorageStateException {
NodeType nodeType = getNodeType();
Expand All @@ -205,16 +181,6 @@ private void verifyClusterId()
}
}

private void verifyUpgradingToLayoutVersion()
throws InconsistentStorageStateException {
int upgradeMark = getUpgradingToLayoutVersion();
if (upgradeMark != INVALID_LAYOUT_VERSION) {
throw new InconsistentStorageStateException("Ozone Manager died during "
+ "a LayoutFeature upgrade.");
//TODO add recovery steps here, or point to a recovery doc.
}
}

private void verifyCreationTime() {
Long creationTime = getCreationTime();
Preconditions.checkNotNull(creationTime);
Expand Down
Expand Up @@ -24,9 +24,7 @@
import static org.apache.hadoop.ozone.upgrade.UpgradeException.ResultCodes.FIRST_UPGRADE_START_ACTION_FAILED;
import static org.apache.hadoop.ozone.upgrade.UpgradeException.ResultCodes.INVALID_REQUEST;
import static org.apache.hadoop.ozone.upgrade.UpgradeException.ResultCodes.LAYOUT_FEATURE_FINALIZATION_FAILED;
import static org.apache.hadoop.ozone.upgrade.UpgradeException.ResultCodes.PERSIST_UPGRADE_TO_LAYOUT_VERSION_FAILED;
import static org.apache.hadoop.ozone.upgrade.UpgradeException.ResultCodes.PREFINALIZE_ACTION_VALIDATION_FAILED;
import static org.apache.hadoop.ozone.upgrade.UpgradeException.ResultCodes.REMOVE_UPGRADE_TO_LAYOUT_VERSION_FAILED;
import static org.apache.hadoop.ozone.upgrade.UpgradeException.ResultCodes.UPDATE_LAYOUT_VERSION_FAILED;
import static org.apache.hadoop.ozone.upgrade.UpgradeFinalizer.Status.FINALIZATION_DONE;
import static org.apache.hadoop.ozone.upgrade.UpgradeFinalizer.Status.FINALIZATION_REQUIRED;
Expand Down Expand Up @@ -203,39 +201,24 @@ protected void finalizeUpgrade(Supplier<Storage> storageSuppplier)
LayoutFeature lf = (LayoutFeature) obj;
Storage layoutStorage = storageSuppplier.get();
Optional<? extends UpgradeAction> action = lf.action(ON_FINALIZE);
finalizeFeature(lf, layoutStorage, action);
runFinalizationAction(lf, action);
updateLayoutVersionInVersionFile(lf, layoutStorage);
versionManager.finalized(lf);
}
versionManager.completeFinalization();
}

protected void finalizeFeature(LayoutFeature feature, Storage config,
Optional<? extends UpgradeAction> action)
throws UpgradeException {
protected void runFinalizationAction(LayoutFeature feature,
Optional<?extends UpgradeAction> action) throws UpgradeException {

if (!action.isPresent()) {
emitNOOPMsg(feature.name());
return;
}

putFinalizationMarkIntoVersionFile(feature, config);

emitStartingFinalizationActionMsg(feature.name());

runOnFinalizeAction(feature, action.get());

emitFinishFinalizationActionMsg(feature.name());

removeFinalizationMarkFromVersionFile(feature, config);
}

private void runOnFinalizeAction(LayoutFeature feature, UpgradeAction action)
throws UpgradeException {
try {
action.execute(component);
} catch (Exception e) {
logFinalizationFailureAndThrow(e, feature.name());
} else {
try {
action.get().execute(component);
} catch (Exception e) {
logFinalizationFailureAndThrow(e, feature.name());
}
}
}

Expand Down Expand Up @@ -325,44 +308,6 @@ protected void updateLayoutVersionInVersionFile(LayoutFeature feature,
}
}

protected void putFinalizationMarkIntoVersionFile(LayoutFeature feature,
Storage config)
throws UpgradeException {
try {
emitUpgradeToLayoutVersionPersistingMsg(feature.name());

setUpgradeToLayoutVersionInStorage(feature.layoutVersion(), config);
persistStorage(config);

emitUpgradeToLayoutVersionPersistedMsg();
} catch (IOException e) {
logUpgradeToLayoutVersionPersistingFailureAndThrow(feature.name(), e);
}
}

protected void removeFinalizationMarkFromVersionFile(
LayoutFeature feature, Storage config) throws UpgradeException {
try {
emitRemovingUpgradeToLayoutVersionMsg(feature.name());

unsetUpgradeToLayoutVersionInStorage(config);
persistStorage(config);

emitRemovedUpgradeToLayoutVersionMsg();
} catch (IOException e) {
logUpgradeToLayoutVersionRemovalFailureAndThrow(feature.name(), e);
}
}

private void setUpgradeToLayoutVersionInStorage(int version,
Storage config) {
config.setUpgradeToLayoutVersion(version);
}

private void unsetUpgradeToLayoutVersionInStorage(Storage config) {
config.unsetUpgradeToLayoutVersion();
}

private int currentStoredLayoutVersion(Storage config) {
return config.getLayoutVersion();
}
Expand Down Expand Up @@ -391,37 +336,6 @@ protected void emitFinishedMsg() {
logAndEmit(msg);
}

protected void emitStartingFinalizationActionMsg(String feature) {
String msg = "Executing onFinalize action of feature: " + feature + ".";
logAndEmit(msg);
}

protected void emitFinishFinalizationActionMsg(String feature) {
String msg = "The feature " + feature + " is finalized.";
logAndEmit(msg);
}

private void emitUpgradeToLayoutVersionPersistingMsg(String feature) {
String msg = "Mark finalization of " + feature + " in VERSION file.";
logAndEmit(msg);
}

private void emitUpgradeToLayoutVersionPersistedMsg() {
String msg = "Finalization mark placed.";
logAndEmit(msg);
}

private void emitRemovingUpgradeToLayoutVersionMsg(String feature) {
String msg = "Remove finalization mark of " + feature
+ " feature from VERSION file.";
logAndEmit(msg);
}

private void emitRemovedUpgradeToLayoutVersionMsg() {
String msg = "Finalization mark removed.";
logAndEmit(msg);
}

protected void logAndEmit(String msg) {
LOG.info(msg);
msgs.offer(msg);
Expand All @@ -439,21 +353,6 @@ private void logLayoutVersionUpdateFailureAndThrow(IOException e)
logAndThrow(e, msg, UPDATE_LAYOUT_VERSION_FAILED);
}

private void logUpgradeToLayoutVersionPersistingFailureAndThrow(
String feature, IOException e
) throws UpgradeException {
String msg = "Failed to update VERSION file with the upgrading feature: "
+ feature + ".";
logAndThrow(e, msg, PERSIST_UPGRADE_TO_LAYOUT_VERSION_FAILED);
}

private void logUpgradeToLayoutVersionRemovalFailureAndThrow(
String feature, IOException e) throws UpgradeException {
String msg =
"Failed to unmark finalization of " + feature + " LayoutFeature.";
logAndThrow(e, msg, REMOVE_UPGRADE_TO_LAYOUT_VERSION_FAILED);
}

private void logAndThrow(Exception e, String msg, ResultCodes resultCode)
throws UpgradeException {
LOG.error(msg, e);
Expand Down
Expand Up @@ -56,15 +56,24 @@ enum UpgradeActionType {
// Run every time an un-finalized component is started up.
VALIDATE_IN_PREFINALIZE,

// Run exactly once when an upgraded cluster is detected with this new
// Run once when an upgraded cluster is started with this new
// layout version.
// If the action fails, it will be run again when the component is
// restarted.
// If updating the VERSION file fails, the action may be run again when the
// component is restarted, even if it finished successfully.
// NOTE 1 : This will not be run in a NEW cluster!
// NOTE 2 : This needs to be a backward compatible action until a DOWNGRADE
// hook is provided!
// hook is provided!
// Even if the action fails partway through, all on disk structures should
// still be in a backwards compatible state.
// NOTE 3 : These actions are not submitted through RATIS (TODO)
ON_FIRST_UPGRADE_START,

// Run exactly once during finalization of layout feature.
// Run once during finalization of the layout feature.
// If the action fails, it will be run again when finalization is retried.
// If updating the VERSION file fails, the action may be run again when
// finalization is retried, even if it finished successfully.
ON_FINALIZE
}
}
Expand Up @@ -105,8 +105,6 @@ public String toString() {
public enum ResultCodes {
OK,
INVALID_REQUEST,
PERSIST_UPGRADE_TO_LAYOUT_VERSION_FAILED,
REMOVE_UPGRADE_TO_LAYOUT_VERSION_FAILED,
UPDATE_LAYOUT_VERSION_FAILED,
LAYOUT_FEATURE_FINALIZATION_FAILED,
PREFINALIZE_ACTION_VALIDATION_FAILED,
Expand Down
16 changes: 14 additions & 2 deletions hadoop-hdds/docs/content/design/upgrade-dev-primer.md
Expand Up @@ -62,11 +62,23 @@ Annotation to specify upgrade action run during specific upgrade phases. Each la
#### VALIDATE_IN_PREFINALIZE
A ‘validation’ action run every time a component is started up with this layout feature being unfinalized.

- Example: Stopping a component if a new configuration is used prior to it being finalized.

- Example: Cleaning up from a failed ON_FINALIZE action that may have left on disk data in an inoperable state.
- Note that because the ON_FINALIZE action failed, the feature remains pre-finalized.

#### ON_FIRST_UPGRADE_START
A backward compatible action run exactly once when an upgraded cluster is detected with this new layout version.
A backward compatible action run once when an upgraded cluster is detected with this new layout version. This differs from VALIDATE_IN_PREFINALIZE because it will not be run again once it completes successfully.
This action will be run again if it fails partway through, and may be run again if another error occurs during the upgrade. The action must always leave on disk data in a backwards compatible state, even if it fails partway through, since it is being executed before finalization.

- Example: The new version expects data in a different location even when it is pre-finalized. The action creates a symlink in the new location pointing to the old location.

#### ON_FINALIZE
An action run exactly once during finalization of layout version (feature).
An action run once during finalization of layout version (feature). This action will be run again if it fails partway through, and may be run again if another error occurs during the upgrade. This is the only action permitted to make backwards incompatible changes to on disk structures, since finalization has been initiated by the time it is run. If a failure partway through could leave the component in an inoperable state, a cleanup action should be used in VALIDATE_IN_PREFINALIZE, which will be run when the component is restarted after a failure.

- Example: Adding a new RocksDB column family.

- Example: Logging a message saying a feature is being finalized.

## ‘Prepare’ the Ozone Manager
Used to flush all transactions to disk, take a DB snapshot, and purge the logs, leaving Ratis in a clean state without unapplied log entries. This prepares the OM for upgrades/downgrades so that no request in the log is applied to the database in the old version of the code in one OM, and the new version of the code in another OM.
Expand Down
Expand Up @@ -163,25 +163,17 @@ public void testFinalizationWithUpgradeAction() throws Exception {
OMUpgradeFinalizer finalizer = new OMUpgradeFinalizer(versionManager);
finalizer.finalize(CLIENT_ID, om);


Iterator<OMLayoutFeature> it = lfs.iterator();
OMLayoutFeature f = it.next();

// the first feature has an upgrade action, it calls the setUpgradeToLV
// method, and the action execution is checked by verifying on om.getVersion
verify(om.getOmStorage(), once())
.setUpgradeToLayoutVersion(f.layoutVersion());
// the first feature has an upgrade action, and the action execution is
// checked by verifying on om.getVersion
verify(om.getOmStorage(), once())
.setLayoutVersion(f.layoutVersion());
verify(om.getOmStorage(), once())
.unsetUpgradeToLayoutVersion();
verify(om, once()).getVersion();

// the second feature has a NOOP it should not call the setUpgradeToLV
// method, but should update the LV.
// The second feature has a NOOP, but should update the layout version.
f = it.next();
verify(om.getOmStorage(), never())
.setUpgradeToLayoutVersion(f.layoutVersion());
verify(om.getOmStorage(), once())
.setLayoutVersion(f.layoutVersion());

Expand Down Expand Up @@ -222,22 +214,14 @@ public void testFinalizationWithFailingUpgradeAction() throws Exception {
when(versionManager.getUpgradeState()).thenReturn(FINALIZATION_DONE);
}

// Verify that we have never removed the upgradeToLV from the storage
// as finalization of the first feature in the list fails.
// Also verify that we have never updated the LV.
// Verify that we have never updated the layout version.
Iterator<OMLayoutFeature> it = lfs.iterator();
OMLayoutFeature f = it.next();
verify(om.getOmStorage(), once())
.setUpgradeToLayoutVersion(f.layoutVersion());
verify(om.getOmStorage(), never())
.setLayoutVersion(f.layoutVersion());
verify(om.getOmStorage(), never())
.unsetUpgradeToLayoutVersion();

// Verify that we never got to the second feature.
f = it.next();
verify(om.getOmStorage(), never())
.setUpgradeToLayoutVersion(f.layoutVersion());
verify(om.getOmStorage(), never())
.setLayoutVersion(f.layoutVersion());

Expand Down

0 comments on commit 5927bd2

Please sign in to comment.