Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix stuck query after cancel or termination when segment is not responding #948

Merged
merged 10 commits into from
May 27, 2024
3 changes: 3 additions & 0 deletions src/backend/cdb/cdbutil.c
Original file line number Diff line number Diff line change
Expand Up @@ -662,6 +662,9 @@ cdbcomponent_cleanupIdleQEs(bool includeWriter)
}
}

/* reset flag in libpq, that is used to avoid stuck in a loop in PQcancel*/
PQbypassConnCloseAtCancel(false);

KnightMurloc marked this conversation as resolved.
Show resolved Hide resolved
return;
}

Expand Down
2 changes: 2 additions & 0 deletions src/backend/fts/ftsprobe.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "postmaster/fts.h"
#include "postmaster/ftsprobe.h"
#include "postmaster/postmaster.h"
#include "storage/pmsignal.h"
#include "utils/snapmgr.h"


Expand Down Expand Up @@ -1175,6 +1176,7 @@ processResponse(fts_context *context)
"triggered successfully",
primary->config->segindex, primary->config->dbid);
ftsInfo->state = FTS_RESPONSE_PROCESSED;
SendPostmasterSignal(PMSIGNAL_FTS_PROMOTED_MIRROR);
break;
case FTS_SYNCREP_OFF_SUCCESS:
elogif(gp_log_fts >= GPVARS_VERBOSITY_VERBOSE, LOG,
Expand Down
23 changes: 23 additions & 0 deletions src/backend/postmaster/postmaster.c
Original file line number Diff line number Diff line change
Expand Up @@ -5690,6 +5690,29 @@ sigusr1_handler(SIGNAL_ARGS)
signal_child(DtxRecoveryPID(), SIGINT);
}

if (CheckPostmasterSignal(PMSIGNAL_FTS_PROMOTED_MIRROR))
{
/*
* Notify all child coordinator backends that FTS has promoted a mirror.
*/
if (pmState == PM_RUN &&
GpIdentity.segindex == MASTER_CONTENT_ID)
{
dlist_iter iter;
dlist_foreach(iter, &BackendList)
{
Backend *bp = dlist_container(Backend, elem, iter.cur);

if (bp->dead_end || !(BACKEND_TYPE_NORMAL & bp->bkend_type))
continue;

SendProcSignal(bp->pid,
PROCSIG_FTS_PROMOTED_MIRROR,
InvalidBackendId);
}
}
}

/*
* Try to advance postmaster's state machine, if a child requests it.
*
Expand Down
21 changes: 21 additions & 0 deletions src/backend/storage/ipc/procsignal.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include "cdb/cdbvars.h"
#include "commands/async.h"
#include "libpq-fe.h"
#include "miscadmin.h"
#include "replication/walsender.h"
#include "storage/latch.h"
Expand Down Expand Up @@ -288,6 +289,23 @@ QueryFinishHandler(void)
}
}

/*
 * Handler for PROCSIG_FTS_PROMOTED_MIRROR: the coordinator postmaster reports
 * that FTS has detected a failed segment and promoted its mirror.
 */
static void
FtsPromotedMirrorHandler(void)
{
	/* Only relevant while a query cancel or a termination is in progress. */
	if (!QueryCancelCleanup && !TermSignalReceived)
		return;

	/*
	 * When the promotion happens during cancel or termination of a query,
	 * libpq is very likely to wait forever for a cancel confirmation from a
	 * segment that is no longer responding.  Tell libpq to stop waiting.
	 */
	PQbypassConnCloseAtCancel(true);
}

/*
* procsignal_sigusr1_handler - handle SIGUSR1 signal.
*/
Expand Down Expand Up @@ -329,6 +347,9 @@ procsignal_sigusr1_handler(SIGNAL_ARGS)
if (CheckProcSignal(PROCSIG_RESOURCE_GROUP_MOVE_QUERY))
HandleMoveResourceGroup();

if (CheckProcSignal(PROCSIG_FTS_PROMOTED_MIRROR))
FtsPromotedMirrorHandler();

if (set_latch_on_sigusr1 && MyProc != NULL)
SetLatch(&MyProc->procLatch);

Expand Down
2 changes: 2 additions & 0 deletions src/include/storage/pmsignal.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ typedef enum
PMSIGNAL_WAKEN_FTS, /* wake up FTS to probe segments */
PMSIGNAL_WAKEN_DTX_RECOVERY, /* wake up dtx recovery to abort dtx xacts */

PMSIGNAL_FTS_PROMOTED_MIRROR, /* FTS has detected failed primary and promoted mirror*/

NUM_PMSIGNALS /* Must be last value of enum! */
} PMSignalReason;

Expand Down
3 changes: 3 additions & 0 deletions src/include/storage/procsignal.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ typedef enum
PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK,
PROCSIG_RESOURCE_GROUP_MOVE_QUERY, /* move query to a new resource group */

PROCSIG_FTS_PROMOTED_MIRROR, /* FTS has detected failed primary and promoted mirror*/

NUM_PROCSIGNALS /* Must be last! */
} ProcSignalReason;

Expand Down
10 changes: 10 additions & 0 deletions src/interfaces/libpq/fe-connect.c
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,8 @@ static const PQEnvironmentOption EnvironmentOptions[] =
static const char uri_designator[] = "postgresql://";
static const char short_uri_designator[] = "postgres://";

/*
 * When true, internal_cancel() gives up waiting for the server to close the
 * cancel connection.  The flag is toggled through PQbypassConnCloseAtCancel(),
 * which may be invoked from a signal handler while internal_cancel() is
 * looping, so it is volatile to force a fresh read on every iteration.
 */
static volatile bool bypass_conn_close_at_cancel = false;

static bool connectOptions1(PGconn *conn, const char *conninfo);
static bool connectOptions2(PGconn *conn);
static int connectDBStart(PGconn *conn);
Expand Down Expand Up @@ -3429,6 +3431,10 @@ PQfreeCancel(PGcancel *cancel)
free(cancel);
}

/*
 * PQbypassConnCloseAtCancel: set or clear the file-level flag that is checked
 * at the top of the wait-for-close loop in internal_cancel().
 *
 * bypass - new value of the flag; false restores the normal waiting behavior.
 */
void
PQbypassConnCloseAtCancel(pqbool bypass)
{
	bypass_conn_close_at_cancel = bypass;
}

/*
* PQcancel and PQrequestCancel: attempt to request cancellation of the
Expand Down Expand Up @@ -3523,6 +3529,10 @@ internal_cancel(SockAddr *raddr, int be_pid, int be_key,
*/
#ifndef WIN32
retry5:

if (bypass_conn_close_at_cancel)
goto cancel_errReturn;
RekGRpth marked this conversation as resolved.
Show resolved Hide resolved

pollFds[0].fd = tmpsock;
pollFds[0].events = POLLIN;
pollFds[0].revents = 0;
Expand Down
2 changes: 2 additions & 0 deletions src/interfaces/libpq/libpq-fe.h
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,8 @@ extern void PQfreeCancel(PGcancel *cancel);
/* issue a cancel request */
extern int PQcancel(PGcancel *cancel, char *errbuf, int errbufsize);

extern void PQbypassConnCloseAtCancel(pqbool bypass); /* GPDB only */

/* issue a finish request */
extern int PQrequestFinish(PGcancel *cancel, char *errbuf, int errbufsize);

Expand Down
Loading
Loading