Skip to content

Commit

Permalink
Force DB repair when any of our processes is killed with SIGBUS
Browse files Browse the repository at this point in the history
SIGBUS means an unaligned memory access or an attempt to access an
area outside of an mmap()ed region. In most cases, for us, this means
LMDB corruption. Thus, if a process gets SIGBUS, it touches a flag
file that triggers a DB repair attempt in the next agent or cf-execd
run.

Ticket: CFE-3127
Changelog: Received SIGBUS now triggers a repair of local DBs
  • Loading branch information
vpodzime committed Oct 9, 2019
1 parent 5dcfaa7 commit a4eb259
Show file tree
Hide file tree
Showing 10 changed files with 77 additions and 8 deletions.
18 changes: 16 additions & 2 deletions cf-agent/cf-agent.c
Expand Up @@ -244,9 +244,23 @@ int main(int argc, char *argv[])
struct timespec start = BeginMeasure();

GenericAgentConfig *config = CheckOpts(argc, argv);
if (PERFORM_DB_CHECK)
bool force_repair = false;
{
char repair_flag_file[PATH_MAX] = { 0 };
xsnprintf(repair_flag_file, PATH_MAX, "%s%c%s",
GetStateDir(), FILE_SEPARATOR, CF_DB_REPAIR_TRIGGER);
/* This is full of race-conditions, but it's just a best-effort
* thing. If a force-repair is missed, it will happen next time. If it's
* done twice, no big deal. */
if (access(repair_flag_file, F_OK) == 0)
{
force_repair = true;
unlink(repair_flag_file);
}
}
if (force_repair || PERFORM_DB_CHECK)
{
repair_lmdb_default();
repair_lmdb_default(force_repair);
}
EvalContext *ctx = EvalContextNew();

Expand Down
6 changes: 3 additions & 3 deletions cf-check/repair.c
Expand Up @@ -11,7 +11,7 @@ int repair_main(ARG_UNUSED int argc, ARG_UNUSED const char *const *const argv)
return 1;
}

int repair_lmdb_default()
int repair_lmdb_default(ARG_UNUSED bool force)
{
Log(LOG_LEVEL_INFO,
"database repair not available on this platform/build");
Expand Down Expand Up @@ -221,7 +221,7 @@ int repair_main(int argc, const char *const *const argv)
return ret;
}

int repair_lmdb_default()
int repair_lmdb_default(bool force)
{
// This function is used by cf-execd and cf-agent, not cf-check

Expand All @@ -241,7 +241,7 @@ int repair_lmdb_default()
Log(LOG_LEVEL_INFO, "Skipping local database repair, no lmdb files");
return 0;
}
const int ret = repair_lmdb_files(files, false);
const int ret = repair_lmdb_files(files, force);
SeqDestroy(files);

if (ret != 0)
Expand Down
2 changes: 1 addition & 1 deletion cf-check/repair.h
Expand Up @@ -4,7 +4,7 @@
#define REPAIR_FILE_EXTENSION ".copy"

int repair_main(int argc, const char *const *argv);
int repair_lmdb_default();
int repair_lmdb_default(bool force);
int repair_lmdb_file(const char *file);

#endif
19 changes: 17 additions & 2 deletions cf-execd/cf-execd.c
Expand Up @@ -135,9 +135,23 @@ static const char *const HINTS[] =
int main(int argc, char *argv[])
{
GenericAgentConfig *config = CheckOpts(argc, argv);
if (PERFORM_DB_CHECK)
bool force_repair = false;
{
repair_lmdb_default();
char repair_flag_file[PATH_MAX] = { 0 };
xsnprintf(repair_flag_file, PATH_MAX, "%s%c%s",
GetStateDir(), FILE_SEPARATOR, CF_DB_REPAIR_TRIGGER);
/* This is full of race-conditions, but it's just a best-effort
* thing. If a force-repair is missed, it will happen next time. If it's
* done twice, no big deal. */
if (access(repair_flag_file, F_OK) == 0)
{
force_repair = true;
unlink(repair_flag_file);
}
}
if (force_repair || PERFORM_DB_CHECK)
{
repair_lmdb_default(force_repair);
}

EvalContext *ctx = EvalContextNew();
Expand Down Expand Up @@ -430,6 +444,7 @@ void StartServer(EvalContext *ctx, Policy *policy, GenericAgentConfig *config, E
WritePID("cf-execd.pid");
signal(SIGINT, HandleSignalsForDaemon);
signal(SIGTERM, HandleSignalsForDaemon);
signal(SIGBUS, HandleSignalsForDaemon);
signal(SIGHUP, HandleSignalsForDaemon);
signal(SIGPIPE, SIG_IGN);
signal(SIGUSR1, HandleSignalsForDaemon);
Expand Down
1 change: 1 addition & 0 deletions cf-key/cf-key.c
Expand Up @@ -132,6 +132,7 @@ static void SetupSignalsForCfKey(CfKeySigHandler sighandler)
signal(SIGTERM, sighandler);
signal(SIGHUP, SIG_IGN);
signal(SIGPIPE, SIG_IGN);
signal(SIGBUS, HandleSignalsForAgent);
signal(SIGUSR1, HandleSignalsForAgent);
signal(SIGUSR2, HandleSignalsForAgent);
}
Expand Down
1 change: 1 addition & 0 deletions cf-monitord/cf-monitord.c
Expand Up @@ -306,6 +306,7 @@ static void ThisAgentInit(EvalContext *ctx)

signal(SIGINT, HandleSignalsForDaemon);
signal(SIGTERM, HandleSignalsForDaemon);
signal(SIGBUS, HandleSignalsForDaemon);
signal(SIGHUP, SIG_IGN);
signal(SIGPIPE, SIG_IGN);
signal(SIGUSR1, HandleSignalsForDaemon);
Expand Down
1 change: 1 addition & 0 deletions cf-serverd/cf-serverd-functions.c
Expand Up @@ -476,6 +476,7 @@ static void InitSignals()

signal(SIGINT, HandleSignalsForDaemon);
signal(SIGTERM, HandleSignalsForDaemon);
signal(SIGBUS, HandleSignalsForDaemon);
signal(SIGHUP, HandleSignalsForDaemon);
signal(SIGPIPE, SIG_IGN);
signal(SIGUSR1, HandleSignalsForDaemon);
Expand Down
2 changes: 2 additions & 0 deletions libpromises/cf3.defs.h
Expand Up @@ -94,6 +94,8 @@

#define CF_ENV_FILE "env_data"

#define CF_DB_REPAIR_TRIGGER "db_repair_required"

#define CF_SAVED ".cfsaved"
#define CF_EDITED ".cfedited"
#define CF_NEW ".cfnew"
Expand Down
1 change: 1 addition & 0 deletions libpromises/generic_agent.c
Expand Up @@ -2109,6 +2109,7 @@ void SetupSignalsForAgent(void)
{
signal(SIGINT, HandleSignalsForAgent);
signal(SIGTERM, HandleSignalsForAgent);
signal(SIGBUS, HandleSignalsForAgent);
signal(SIGHUP, SIG_IGN);
signal(SIGPIPE, SIG_IGN);
signal(SIGUSR1, HandleSignalsForAgent);
Expand Down
34 changes: 34 additions & 0 deletions libpromises/signals.c
Expand Up @@ -24,6 +24,8 @@

#include <signals.h>
#include <cleanup.h>
#include <known_dirs.h> /* GetStateDir() */
#include <file_lib.h> /* FILE_SEPARATOR */

static bool PENDING_TERMINATION = false; /* GLOBAL_X */

Expand Down Expand Up @@ -149,6 +151,22 @@ void HandleSignalsForAgent(int signum)
/* TODO don't exit from the signal handler, just set a flag. Reason is
* that all the cleanup() hooks we register are not reentrant. */
DoCleanupAndExit(0);
case SIGBUS:
    /* SIGBUS is raised for a mis-aligned memory access or for touching
     * memory outside the bounds of an mmap()ed area. For us that almost
     * certainly means LMDB corruption, so leave a flag file in the state
     * directory; cf-agent and cf-execd look for that file at startup and
     * force a repair of the local databases when it exists. */
    {
        char filename[PATH_MAX] = { 0 }; /* stack buffer, trying to avoid memory allocation */
        xsnprintf(filename, PATH_MAX, "%s%c%s",
                  GetStateDir(), FILE_SEPARATOR, CF_DB_REPAIR_TRIGGER);
        /* open() must be given the mode argument whenever O_CREAT is used;
         * without it the new file gets indeterminate permission bits. Only
         * the existence of the file matters, so owner read/write suffices. */
        int fd = open(filename, O_CREAT|O_RDWR, 0600);
        if (fd != -1)
        {
            close(fd);
        }
        /* else: we tried, nothing more to do in the limited environment of a
         * signal handler */
    }
    /* NOTE(review): POSIX leaves the behavior undefined when a handler
     * returns normally from a SIGBUS that was not generated by kill() or
     * raise(); typically the faulting instruction is simply re-executed.
     * Confirm whether this case should terminate the process (e.g. with
     * _exit()) instead of falling out of the switch.
     * NOTE(review): xsnprintf() and GetStateDir() are presumably not
     * async-signal-safe -- verify. */
    break;
case SIGUSR1:
LogSetGlobalLevel(LOG_LEVEL_DEBUG);
break;
Expand Down Expand Up @@ -178,6 +196,22 @@ void HandleSignalsForDaemon(int signum)
case SIGKILL:
PENDING_TERMINATION = true;
break;
case SIGBUS:
    /* SIGBUS is raised for a mis-aligned memory access or for touching
     * memory outside the bounds of an mmap()ed area. For us that almost
     * certainly means LMDB corruption, so leave a flag file in the state
     * directory; cf-agent and cf-execd look for that file at startup and
     * force a repair of the local databases when it exists. */
    {
        char filename[PATH_MAX] = { 0 }; /* stack buffer, trying to avoid memory allocation */
        xsnprintf(filename, PATH_MAX, "%s%c%s",
                  GetStateDir(), FILE_SEPARATOR, CF_DB_REPAIR_TRIGGER);
        /* open() must be given the mode argument whenever O_CREAT is used;
         * without it the new file gets indeterminate permission bits. Only
         * the existence of the file matters, so owner read/write suffices. */
        int fd = open(filename, O_CREAT|O_RDWR, 0600);
        if (fd != -1)
        {
            close(fd);
        }
        /* else: we tried, nothing more to do in the limited environment of a
         * signal handler */
    }
    /* NOTE(review): POSIX leaves the behavior undefined when a handler
     * returns normally from a SIGBUS that was not generated by kill() or
     * raise(); typically the faulting instruction is simply re-executed.
     * Confirm whether the daemon should terminate here (e.g. with _exit())
     * instead of falling out of the switch.
     * NOTE(review): xsnprintf() and GetStateDir() are presumably not
     * async-signal-safe -- verify. */
    break;
case SIGUSR1:
LogSetGlobalLevel(LOG_LEVEL_DEBUG);
break;
Expand Down

0 comments on commit a4eb259

Please sign in to comment.