Skip to content

Commit

Permalink
HDDS-3642. Stop/Pause Background services while replacing OM DB with checkpoint from Leader (#1002)
Browse files Browse the repository at this point in the history
  • Loading branch information
hanishakoneru committed Jun 14, 2020
1 parent e663992 commit aa04ac0
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 23 deletions.
Expand Up @@ -16,6 +16,7 @@
*/
package org.apache.hadoop.ozone.om;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
Expand Down Expand Up @@ -172,10 +173,12 @@ public void testInstallSnapshot() throws Exception {
followerOMLastAppliedIndex < leaderOMSnaphsotIndex);

// Install leader OM's db checkpoint on the lagging OM.
File oldDbLocation = followerOM.getMetadataManager().getStore()
.getDbLocation();
followerOM.getOmRatisServer().getOmStateMachine().pause();
followerOM.getMetadataManager().getStore().close();
followerOM.replaceOMDBWithCheckpoint(
leaderOMSnaphsotIndex, leaderDbCheckpoint.getCheckpointLocation());
followerOM.replaceOMDBWithCheckpoint(leaderOMSnaphsotIndex, oldDbLocation,
leaderDbCheckpoint.getCheckpointLocation());

// Reload the follower OM with new DB checkpoint from the leader OM.
followerOM.reloadOMState(leaderOMSnaphsotIndex, leaderOMSnapshotTermIndex);
Expand Down
Expand Up @@ -78,7 +78,6 @@
import org.apache.hadoop.hdds.utils.ProtocolMessageMetrics;
import org.apache.hadoop.hdds.utils.RetriableTask;
import org.apache.hadoop.hdds.utils.db.DBCheckpoint;
import org.apache.hadoop.hdds.utils.db.DBStore;
import org.apache.hadoop.hdds.utils.db.DBUpdatesWrapper;
import org.apache.hadoop.hdds.utils.db.SequenceNumberNotFoundException;
import org.apache.hadoop.io.Text;
Expand Down Expand Up @@ -3038,14 +3037,20 @@ public TermIndex installSnapshot(String leaderId) {
return null;
}

// Pause the State Machine so that no new transactions can be applied.
// This action also clears the OM Double Buffer so that if there are any
// pending transactions in the buffer, they are discarded.
// TODO: The Ratis server should also be paused here. This is required
// because a leader election might happen while the snapshot
// installation is in progress and the new leader might start sending
// append log entries to the ratis server.
omRatisServer.getOmStateMachine().pause();
File oldDBLocation = metadataManager.getStore().getDbLocation();
try {
// Stop Background services
stopServices();

// Pause the State Machine so that no new transactions can be applied.
// This action also clears the OM Double Buffer so that if there are any
// pending transactions in the buffer, they are discarded.
omRatisServer.getOmStateMachine().pause();
} catch (Exception e) {
LOG.error("Failed to stop/ pause the services. Cannot proceed with " +
"installing the new checkpoint.", e);
return null;
}

//TODO: un-pause SM if any failures and retry?

Expand All @@ -3066,7 +3071,8 @@ public TermIndex installSnapshot(String leaderId) {

File dbBackup;
try {
dbBackup = replaceOMDBWithCheckpoint(lastAppliedIndex, newDBlocation);
dbBackup = replaceOMDBWithCheckpoint(lastAppliedIndex, oldDBLocation,
newDBlocation);
} catch (Exception e) {
LOG.error("OM DB checkpoint replacement with new downloaded checkpoint " +
"failed.", e);
Expand Down Expand Up @@ -3116,6 +3122,12 @@ private DBCheckpoint getDBCheckpointFromLeader(String leaderId) {
return null;
}

/**
 * Stops the OM background services ahead of replacing the OM DB with a
 * checkpoint downloaded from the leader.
 *
 * The key manager, the secret manager and the metadata manager are stopped
 * in that order. There is no per-call error handling: if one stop call
 * throws, the calls after it are skipped and the exception propagates.
 *
 * NOTE(review): metadataManager.stop() presumably also closes the underlying
 * DB store, which the DB replacement step relies on - confirm against the
 * metadata manager implementation.
 *
 * @throws Exception if any of the stop calls fails
 */
void stopServices() throws Exception {
// Stop the key manager first, before the metadata manager is stopped.
keyManager.stop();
stopSecretManager();
// Stopped last; see the review note above regarding the DB store.
metadataManager.stop();
}

/**
* Replace the current OM DB with the new DB checkpoint.
*
Expand All @@ -3124,20 +3136,16 @@ private DBCheckpoint getDBCheckpointFromLeader(String leaderId) {
* @return location of the backup of the original DB
* @throws Exception
*/
File replaceOMDBWithCheckpoint(long lastAppliedIndex, Path checkpointPath)
throws Exception {
// Stop the DB first
DBStore store = metadataManager.getStore();
store.close();
File replaceOMDBWithCheckpoint(long lastAppliedIndex, File oldDB,
Path checkpointPath) throws Exception {

// Take a backup of the current DB
File db = store.getDbLocation();
String dbBackupName = OzoneConsts.OM_DB_BACKUP_PREFIX +
lastAppliedIndex + "_" + System.currentTimeMillis();
File dbBackup = new File(db.getParentFile(), dbBackupName);
File dbBackup = new File(oldDB.getParentFile(), dbBackupName);

try {
Files.move(db.toPath(), dbBackup.toPath());
Files.move(oldDB.toPath(), dbBackup.toPath());
} catch (IOException e) {
LOG.error("Failed to create a backup of the current DB. Aborting " +
"snapshot installation.");
Expand All @@ -3146,12 +3154,12 @@ File replaceOMDBWithCheckpoint(long lastAppliedIndex, Path checkpointPath)

// Move the new DB checkpoint into the om metadata dir
try {
Files.move(checkpointPath, db.toPath());
Files.move(checkpointPath, oldDB.toPath());
} catch (IOException e) {
LOG.error("Failed to move downloaded DB checkpoint {} to metadata " +
"directory {}. Resetting to original DB.", checkpointPath,
db.toPath());
Files.move(dbBackup.toPath(), db.toPath());
oldDB.toPath());
Files.move(dbBackup.toPath(), oldDB.toPath());
throw e;
}
return dbBackup;
Expand Down

0 comments on commit aa04ac0

Please sign in to comment.