Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LMDB corruption recovery -- follow-up #3880

Merged
merged 7 commits into from Oct 28, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 4 additions & 2 deletions cf-agent/cf-agent.c
Expand Up @@ -84,6 +84,7 @@
#include <string_lib.h>
#include <cfnet.h>
#include <repair.h>
#include <dbm_api.h> /* CheckDBRepairFlagFile() */
#include <sys/types.h> /* checking umask on writing setxid log */
#include <sys/stat.h> /* checking umask on writing setxid log */

Expand Down Expand Up @@ -244,9 +245,10 @@ int main(int argc, char *argv[])
struct timespec start = BeginMeasure();

GenericAgentConfig *config = CheckOpts(argc, argv);
if (PERFORM_DB_CHECK)
bool force_repair = CheckDBRepairFlagFile();
if (force_repair || PERFORM_DB_CHECK)
{
repair_lmdb_default();
repair_lmdb_default(force_repair);
}
EvalContext *ctx = EvalContextNew();

Expand Down
110 changes: 94 additions & 16 deletions cf-check/repair.c
Expand Up @@ -11,7 +11,7 @@ int repair_main(ARG_UNUSED int argc, ARG_UNUSED const char *const *const argv)
return 1;
}

int repair_lmdb_default()
int repair_lmdb_default(ARG_UNUSED bool force)
{
Log(LOG_LEVEL_INFO,
"database repair not available on this platform/build");
Expand All @@ -30,6 +30,7 @@ int repair_lmdb_default()
#include <utilities.h>
#include <diagnose.h>
#include <string_lib.h>
#include <file_lib.h>
#include <replicate_lmdb.h>

static void print_usage(void)
Expand Down Expand Up @@ -78,9 +79,50 @@ int remove_files(Seq *files)
return failures;
}

int repair_lmdb_file(const char *file)
/**
 * Overwrite the beginning of the repair timestamp file with the current time.
 *
 * @param fd_tstamp File descriptor of the repair timestamp file; it is
 *                  rewound to offset 0 and written to.
 * @return true if a complete time_t was written at offset 0, false if the
 *         current time could not be obtained, the descriptor could not be
 *         rewound, or the write failed or was short.
 *
 * @note The timestamp is stored as the raw in-memory bytes of a time_t.
 */
static bool record_repair_timestamp(int fd_tstamp)
{
    const time_t this_timestamp = time(NULL);
    if (this_timestamp == (time_t) -1)
    {
        /* no valid time to record */
        return false;
    }

    /* The timestamp must replace whatever is at the start of the file. If
     * the rewind fails, writing anyway would put the value at an arbitrary
     * offset and still report success, so bail out instead. */
    if (lseek(fd_tstamp, 0, SEEK_SET) == (off_t) -1)
    {
        return false;
    }

    const ssize_t n_written = write(fd_tstamp, &this_timestamp,
                                    sizeof(this_timestamp));

    /* a short write should never happen for so few bytes, but a partial
     * timestamp is useless, so treat it as a failure */
    return (n_written == (ssize_t) sizeof(this_timestamp));
}


/**
* @param file LMDB file to repair
* @param fd_tstamp An open FD to the repair timestamp file or -1
*
* @note If #fd_tstamp != -1, it must already be open and the caller is
* responsible for its file locking. If #fd_tstamp == -1, this function
* opens the repair timestamp file itself and handles the file locking.
*/
int repair_lmdb_file(const char *file, int fd_tstamp)
{
int ret;
char *dest_file = StringFormat("%s"REPAIR_FILE_EXTENSION, file);

FileLock lock = EMPTY_FILE_LOCK;
if (fd_tstamp == -1)
{
char *tstamp_file = StringFormat("%s.repaired", file);
int lock_ret = ExclusiveFileLockPath(&lock, tstamp_file, true); /* wait=true */
free(tstamp_file);
if (lock_ret < 0)
{
/* Should never happen because we tried to wait for the lock. */
Log(LOG_LEVEL_ERR,
"Failed to acquire lock for the '%s' DB repair timestamp file",
file);
ret = -1;
goto cleanup;
}
fd_tstamp = lock.fd;
}
pid_t child_pid = fork();
if (child_pid == 0)
{
Expand All @@ -95,23 +137,48 @@ int repair_lmdb_file(const char *file)
if (pid != child_pid)
{
/* real error that should never happen */
return -1;
ret = -1;
goto cleanup;
}
if (WIFEXITED(status) && WEXITSTATUS(status) != CF_CHECK_OK
&& WEXITSTATUS(status) != CF_CHECK_LMDB_CORRUPT_PAGE)
{
Log(LOG_LEVEL_ERR, "Failed to repair file '%s', removing", file);
unlink(file);
free(dest_file);
return WEXITSTATUS(status);
if (unlink(file) != 0)
{
Log(LOG_LEVEL_ERR, "Failed to remove file '%s'", file);
ret = -1;
}
else
{
if (!record_repair_timestamp(fd_tstamp))
{
Log(LOG_LEVEL_ERR, "Failed to write the timestamp of repair of the '%s' file",
file);
}
ret = WEXITSTATUS(status);
}
goto cleanup;
}
else if (WIFSIGNALED(status))
{
Log(LOG_LEVEL_ERR, "Failed to repair file '%s', child process signaled (%d), removing",
file, WTERMSIG(status));
unlink(file);
free(dest_file);
return signal_to_cf_check_code(WTERMSIG(status));
if (unlink(file) != 0)
{
Log(LOG_LEVEL_ERR, "Failed to remove file '%s'", file);
ret = -1;
}
else
{
if (!record_repair_timestamp(fd_tstamp))
{
Log(LOG_LEVEL_ERR, "Failed to write the timestamp of repair of the '%s' file",
file);
}
ret = signal_to_cf_check_code(WTERMSIG(status));
}
goto cleanup;
}
else
{
Expand All @@ -123,13 +190,24 @@ int repair_lmdb_file(const char *file)
"Failed to replace file '%s' with the repaired copy: %s",
file, strerror(errno));
unlink(dest_file);
free(dest_file);
return -1;
ret = -1;
goto cleanup;
}
free(dest_file);
return 0;
if (!record_repair_timestamp(fd_tstamp))
{
Log(LOG_LEVEL_ERR, "Failed to write the timestamp of repair of the '%s' file",
file);
}
ret = 0;
}
}
cleanup:
free(dest_file);
if (lock.fd != -1)
{
ExclusiveFileUnlock(&lock, true); /* close=true */
}
return ret;
}

int repair_lmdb_files(Seq *files, bool force)
Expand Down Expand Up @@ -167,7 +245,7 @@ int repair_lmdb_files(Seq *files, bool force)
for (int i = 0; i < length; ++i)
{
const char *file = SeqAt(files, i);
if (repair_lmdb_file(file) != 0)
if (repair_lmdb_file(file, -1) == -1)
{
ret++;
}
Expand Down Expand Up @@ -220,7 +298,7 @@ int repair_main(int argc, const char *const *const argv)
return ret;
}

int repair_lmdb_default()
int repair_lmdb_default(bool force)
{
// This function is used by cf-execd and cf-agent, not cf-check

Expand All @@ -240,7 +318,7 @@ int repair_lmdb_default()
Log(LOG_LEVEL_INFO, "Skipping local database repair, no lmdb files");
return 0;
}
const int ret = repair_lmdb_files(files, false);
const int ret = repair_lmdb_files(files, force);
SeqDestroy(files);

if (ret != 0)
Expand Down
4 changes: 2 additions & 2 deletions cf-check/repair.h
Expand Up @@ -4,7 +4,7 @@
#define REPAIR_FILE_EXTENSION ".copy"

int repair_main(int argc, const char *const *argv);
int repair_lmdb_default();
int repair_lmdb_file(const char *file);
int repair_lmdb_default(bool force);
int repair_lmdb_file(const char *file, int fd_tstamp);

#endif
7 changes: 5 additions & 2 deletions cf-execd/cf-execd.c
Expand Up @@ -41,6 +41,7 @@
#include <printsize.h>
#include <cleanup.h>
#include <repair.h>
#include <dbm_api.h> /* CheckDBRepairFlagFile() */
#include <string_lib.h>

#include <cf-windows-functions.h>
Expand Down Expand Up @@ -135,9 +136,10 @@ static const char *const HINTS[] =
int main(int argc, char *argv[])
{
GenericAgentConfig *config = CheckOpts(argc, argv);
if (PERFORM_DB_CHECK)
bool force_repair = CheckDBRepairFlagFile();
if (force_repair || PERFORM_DB_CHECK)
{
repair_lmdb_default();
repair_lmdb_default(force_repair);
}

EvalContext *ctx = EvalContextNew();
Expand Down Expand Up @@ -430,6 +432,7 @@ void StartServer(EvalContext *ctx, Policy *policy, GenericAgentConfig *config, E
WritePID("cf-execd.pid");
signal(SIGINT, HandleSignalsForDaemon);
signal(SIGTERM, HandleSignalsForDaemon);
signal(SIGBUS, HandleSignalsForDaemon);
signal(SIGHUP, HandleSignalsForDaemon);
signal(SIGPIPE, SIG_IGN);
signal(SIGUSR1, HandleSignalsForDaemon);
Expand Down
1 change: 1 addition & 0 deletions cf-key/cf-key.c
Expand Up @@ -132,6 +132,7 @@ static void SetupSignalsForCfKey(CfKeySigHandler sighandler)
signal(SIGTERM, sighandler);
signal(SIGHUP, SIG_IGN);
signal(SIGPIPE, SIG_IGN);
signal(SIGBUS, HandleSignalsForAgent);
signal(SIGUSR1, HandleSignalsForAgent);
signal(SIGUSR2, HandleSignalsForAgent);
}
Expand Down
1 change: 1 addition & 0 deletions cf-monitord/cf-monitord.c
Expand Up @@ -306,6 +306,7 @@ static void ThisAgentInit(EvalContext *ctx)

signal(SIGINT, HandleSignalsForDaemon);
signal(SIGTERM, HandleSignalsForDaemon);
signal(SIGBUS, HandleSignalsForDaemon);
signal(SIGHUP, SIG_IGN);
signal(SIGPIPE, SIG_IGN);
signal(SIGUSR1, HandleSignalsForDaemon);
Expand Down
1 change: 1 addition & 0 deletions cf-serverd/cf-serverd-functions.c
Expand Up @@ -476,6 +476,7 @@ static void InitSignals()

signal(SIGINT, HandleSignalsForDaemon);
signal(SIGTERM, HandleSignalsForDaemon);
signal(SIGBUS, HandleSignalsForDaemon);
signal(SIGHUP, HandleSignalsForDaemon);
signal(SIGPIPE, SIG_IGN);
signal(SIGUSR1, HandleSignalsForDaemon);
Expand Down
2 changes: 1 addition & 1 deletion libntech
2 changes: 2 additions & 0 deletions libpromises/cf3.defs.h
Expand Up @@ -94,6 +94,8 @@

#define CF_ENV_FILE "env_data"

#define CF_DB_REPAIR_TRIGGER "db_repair_required"

#define CF_SAVED ".cfsaved"
#define CF_EDITED ".cfedited"
#define CF_NEW ".cfnew"
Expand Down