Skip to content

Commit

Permalink
filedaemon: solve a race condition within the heartbeat thread shutdown
Browse files Browse the repository at this point in the history
  • Loading branch information
franku committed Aug 27, 2020
1 parent 2536984 commit a1c51d2
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 3 deletions.
10 changes: 9 additions & 1 deletion core/src/filed/heartbeat.cc
Expand Up @@ -72,6 +72,8 @@ extern "C" void* sd_heartbeat_thread(void* arg)
jcr->impl->hb_dir_bsock = dir;
dir->suppress_error_msgs_ = true;
sd->suppress_error_msgs_ = true;
jcr->impl->hb_initialized_once =
true; // initialize last to avoid race condition

/* Hang reading the socket to the SD, and every time we get
* a heartbeat or we get a wait timeout (5 seconds), we
Expand Down Expand Up @@ -122,6 +124,7 @@ void StartHeartbeatMonitor(JobControlRecord* jcr)
if (!no_signals) {
jcr->impl->hb_bsock = NULL;
jcr->impl->hb_started = false;
jcr->impl->hb_initialized_once = false;
jcr->impl->hb_dir_bsock = NULL;
pthread_create(&jcr->impl->heartbeat_id, NULL, sd_heartbeat_thread,
(void*)jcr);
Expand All @@ -133,8 +136,9 @@ void StopHeartbeatMonitor(JobControlRecord* jcr)
{
int cnt = 0;
if (no_signals) { return; }

/* Wait max 10 secs for heartbeat thread to start */
- while (!jcr->impl->hb_started && cnt++ < 200) {
+ while (!jcr->impl->hb_initialized_once && cnt++ < 200) {
Bmicrosleep(0, 50000); /* wait for start */
}

Expand Down Expand Up @@ -175,6 +179,8 @@ void StopHeartbeatMonitor(JobControlRecord* jcr)
// delete jcr->impl_->hb_dir_bsock;
jcr->impl->hb_dir_bsock.reset();
}

jcr->impl->hb_initialized_once = false;
}

/**
Expand All @@ -198,6 +204,8 @@ extern "C" void* dir_heartbeat_thread(void* arg)
jcr->impl->hb_bsock.reset(dir);
jcr->impl->hb_started = true;
dir->suppress_error_msgs_ = true;
jcr->impl->hb_initialized_once =
true; // initialize last to avoid race condition

while (!dir->IsStop()) {
time_t now, next;
Expand Down
7 changes: 5 additions & 2 deletions core/src/filed/jcr_private.h
Expand Up @@ -3,7 +3,7 @@
Copyright (C) 2000-2012 Free Software Foundation Europe e.V.
Copyright (C) 2011-2012 Planets Communications B.V.
- Copyright (C) 2013-2019 Bareos GmbH & Co. KG
+ Copyright (C) 2013-2020 Bareos GmbH & Co. KG
This program is Free Software; you can redistribute it and/or
modify it under the terms of version three of the GNU Affero General Public
Expand All @@ -26,6 +26,8 @@

#include "include/bareos.h"

#include <atomic>

struct AclData;
struct XattrData;

Expand Down Expand Up @@ -68,7 +70,8 @@ struct JobControlRecordPrivate {
uint32_t StartBlock{};
uint32_t EndBlock{};
pthread_t heartbeat_id{}; /**< Id of heartbeat thread */
- volatile bool hb_started{}; /**< Heartbeat running */
+ std::atomic<bool> hb_initialized_once{}; /**< Heartbeat initialized */
+ std::atomic<bool> hb_started{}; /**< Heartbeat running */
std::shared_ptr<BareosSocket> hb_bsock; /**< Duped SD socket */
std::shared_ptr<BareosSocket> hb_dir_bsock; /**< Duped DIR socket */
alist* RunScripts{}; /**< Commands to run before and after job */
Expand Down

0 comments on commit a1c51d2

Please sign in to comment.