Skip to content

Commit

Permalink
SD to SD replication makes SD crash
Browse files Browse the repository at this point in the history
The problem is that we wait up to 30 minutes for the SD-SD replication
job to start, but the total migration/copy run could take longer than
30 minutes to finish all jobs. The current code doesn't take this into
consideration and doesn't check whether a remote SD has actually
connected to do the replication.

So the fix is to not continue when the SD replication socket is not
connected, and also to remove the timeout on starting an SD-SD replication
session. The normal FD-SD connection is protected with a timeout so we
don't hang when an FD never connects. As we support canceling a storage Job
from the director, and the director should cleanly cancel the storage job
anyway when it fails the copy or migration job, removing the timeout should
be no problem.

In a normal backup/restore there are three daemons involved, namely the
director, the storage daemon and the file daemon, but with migration the
director controls everything, including the (at most two) storage daemons.

Fixes #276: SD to SD replication makes SD crash
  • Loading branch information
Marco van Wieringen committed Feb 17, 2015
1 parent 0349eb4 commit f1a8035
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 20 deletions.
12 changes: 8 additions & 4 deletions src/stored/dir_cmd.c
Expand Up @@ -479,7 +479,7 @@ static bool cancel_cmd(JCR *cjcr)
jcr->setJobStatus(status);

Dmsg2(800, "Cancel JobId=%d %p\n", jcr->JobId, jcr);
if (!jcr->authenticated && oldStatus == JS_WaitFD) {
if (!jcr->authenticated && (oldStatus == JS_WaitFD || oldStatus == JS_WaitSD)) {
pthread_cond_signal(&jcr->job_start_wait); /* wake waiting thread */
}

Expand All @@ -488,9 +488,13 @@ static bool cancel_cmd(JCR *cjcr)
jcr->file_bsock->set_timed_out();
Dmsg2(800, "Term bsock jid=%d %p\n", jcr->JobId, jcr);
} else {
/* Still waiting for FD to connect, release it */
pthread_cond_signal(&jcr->job_start_wait); /* wake waiting job */
Dmsg2(800, "Signal FD connect jid=%d %p\n", jcr->JobId, jcr);
if (oldStatus != JS_WaitSD) {
/*
* Still waiting for FD to connect, release it
*/
pthread_cond_signal(&jcr->job_start_wait); /* wake waiting job */
Dmsg2(800, "Signal FD connect jid=%d %p\n", jcr->JobId, jcr);
}
}

/*
Expand Down
30 changes: 14 additions & 16 deletions src/stored/sd_cmds.c
Expand Up @@ -219,37 +219,33 @@ bool do_listen_run(JCR *jcr)
{
char ec1[30];
int errstat = 0;
struct timeval tv;
struct timezone tz;
struct timespec timeout;
BSOCK *dir = jcr->dir_bsock;

jcr->sendJobStatus(JS_WaitSD); /* wait for SD to connect */

gettimeofday(&tv, &tz);
timeout.tv_nsec = tv.tv_usec * 1000;
timeout.tv_sec = tv.tv_sec + me->client_wait;

Dmsg3(50, "%s waiting %d sec for SD to contact SD key=%s\n",
jcr->Job, (int)(timeout.tv_sec-time(NULL)), jcr->sd_auth_key);
Dmsg2(50, "%s waiting for SD to contact SD key=%s\n", jcr->Job, jcr->sd_auth_key);
Dmsg2(800, "Wait SD for jid=%d %p\n", jcr->JobId, jcr);

/*
* Wait for the Storage daemon to contact us to start the Job,
* when he does, we will be released, unless the 30 minutes expires.
* Wait for the Storage daemon to contact us to start the Job, when he does, we will be released.
*/
P(mutex);
while (!jcr->authenticated && !job_canceled(jcr)) {
errstat = pthread_cond_timedwait(&jcr->job_start_wait, &mutex, &timeout);
if (errstat == ETIMEDOUT || errstat == EINVAL || errstat == EPERM) {
errstat = pthread_cond_wait(&jcr->job_start_wait, &mutex);
if (errstat == EINVAL || errstat == EPERM) {
break;
}
Dmsg1(800, "=== Auth cond errstat=%d\n", errstat);
}
Dmsg3(50, "Auth=%d canceled=%d errstat=%d\n", jcr->authenticated,
job_canceled(jcr), errstat);
Dmsg3(50, "Auth=%d canceled=%d errstat=%d\n", jcr->authenticated, job_canceled(jcr), errstat);
V(mutex);
Dmsg2(800, "Auth fail or cancel for jid=%d %p\n", jcr->JobId, jcr);

if (!jcr->authenticated || !jcr->store_bsock) {
Dmsg2(800, "Auth fail or cancel for jid=%d %p\n", jcr->JobId, jcr);
dequeue_messages(jcr); /* send any queued messages */

goto cleanup;
}

Dmsg1(120, "Start run Job=%s\n", jcr->Job);

Expand All @@ -271,9 +267,11 @@ bool do_listen_run(JCR *jcr)
do_sd_commands(jcr);

jcr->end_time = time(NULL);

dequeue_messages(jcr); /* send any queued messages */
jcr->setJobStatus(JS_Terminated);

cleanup:
generate_plugin_event(jcr, bsdEventJobEnd);

dir->fsend(Job_end, jcr->Job, jcr->JobStatus, jcr->JobFiles,
Expand Down

0 comments on commit f1a8035

Please sign in to comment.