From d3892ff3742e9479b0d108bffc7e8316a31cce53 Mon Sep 17 00:00:00 2001 From: Wen Lin Date: Sun, 17 Jan 2016 18:46:52 +0800 Subject: [PATCH 1/4] HAWQ-274. Add check for segments' temporary directories 1. add two columns in gp_segment_config, failed_tmpdir_num and failed_tmpdir; 2. segments' temporary directory information is loaded in shared memeory; 2. segment's RM process checks and reports failed tmp dir number and path in IMAlive message; 3. master's RM process updates segment's status in catalog table, if failed tmp dir number exceeds the guc values, this segment is considered as down. --- src/backend/cdb/cdbtmpdir.c | 343 +++++++++++++---- src/backend/cdb/cdbvars.c | 1 + src/backend/postmaster/identity.c | 4 +- .../communication/rmcomm_RMSEG2RM.c | 13 +- src/backend/resourcemanager/include/dynrm.h | 2 + .../resourcemanager/include/resourcepool.h | 23 +- src/backend/resourcemanager/requesthandler.c | 53 ++- .../resourcemanager/requesthandler_RMSEG.c | 73 +++- src/backend/resourcemanager/resourcemanager.c | 70 ++-- .../resourcemanager/resourcemanager_RMSEG.c | 10 +- src/backend/resourcemanager/resourcepool.c | 345 ++++++++++++++++-- src/backend/storage/ipc/ipci.c | 4 +- src/backend/utils/gp/segadmin.c | 2 + src/backend/utils/init/postinit.c | 4 +- src/backend/utils/misc/guc.c | 10 + src/include/catalog/gp_segment_config.h | 13 +- src/include/cdb/cdbtmpdir.h | 23 +- src/include/cdb/cdbvars.h | 1 + src/include/storage/lwlock.h | 3 +- tools/bin/gppylib/data/2.0.json | 16 +- 20 files changed, 829 insertions(+), 184 deletions(-) diff --git a/src/backend/cdb/cdbtmpdir.c b/src/backend/cdb/cdbtmpdir.c index 34924db6cf..687bc5f81f 100644 --- a/src/backend/cdb/cdbtmpdir.c +++ b/src/backend/cdb/cdbtmpdir.c @@ -17,113 +17,300 @@ * under the License. */ -#include "cdb/cdbtmpdir.h" #include "postgres.h" - -#include -#include - -#include "access/heapam.h" -#include "access/xact.h" -#include "catalog/catalog.h" -#include "catalog/namespace.h" -#include "catalog/pg_authid.h" -#include "catalog/pg_database.h" -#include "catalog/pg_tablespace.h" -#include "libpq/hba.h" -#include "libpq/libpq-be.h" #include "cdb/cdbvars.h" #include "cdb/cdbutil.h" -#include "mb/pg_wchar.h" +#include "cdb/cdbtmpdir.h" +#include "cdb/cdbvars.h" #include "miscadmin.h" -#include "pgstat.h" #include "postmaster/autovacuum.h" -#include "postmaster/postmaster.h" -#include "storage/backendid.h" -#include "storage/fd.h" #include "storage/ipc.h" -#include "storage/proc.h" -#include "storage/procarray.h" -#include "storage/procsignal.h" -#include "storage/sinvaladt.h" -#include "storage/smgr.h" -#include "utils/acl.h" -#include "utils/flatfiles.h" -#include "utils/guc.h" -#include "utils/relcache.h" -#include "utils/resscheduler.h" -#include "utils/syscache.h" -#include "utils/tqual.h" /* SharedSnapshot */ -#include "utils/portal.h" -#include "pgstat.h" - -static List *initTmpDirList(List *list, char *tmpdir_config); -static void destroyTmpDirList(List *list); - -List *initTmpDirList(List *list, char *tmpdir_string) +#include "storage/shmem.h" +#include + +TmpDirInfo* TmpDirInfoArray = NULL; + +static List *tmpDirList = NULL; + +int32_t TmpDirNum = 0; + +Size TmpDirInfoArraySize(void); + +void TmpDirInfoArray_ShmemInit(void); + +char* GetTmpDirPathFromArray(int64_t idx); + +bool DestroyTmpDirInfoArray(TmpDirInfo *info); + +bool CheckTmpDirAvailable(char *path); + +void destroyTmpDirList(List *list) +{ + ListCell *lc = NULL; + + foreach(lc, list) + { + char *tmpdir = (char *)lfirst(lc); + pfree(tmpdir); + } + list_free(list); +} + +static bool CheckDirValid(char* path) { - int idx = -1; - int i = 0; - char *tmpdir; - - for (i=0;i 1) + { + tmpdir = (char *)palloc0(i-idx); + strncpy(tmpdir, szTmpDir+idx+1, i-idx-1); + if(CheckDirValid(tmpdir)) + { + tmpDirNum++; + elog(LOG, "Get a temporary directory:%s", tmpdir); + tmpDirList = lappend(tmpDirList, tmpdir); + } + else + { + pfree(tmpdir); + } + } idx = i; } } - tmpdir = (char *)palloc0(i-idx); - memcpy(tmpdir, tmpdir_string+idx+1, i-idx-1); - list = lappend(list, tmpdir); - return list; + elog(LOG, "Get %d temporary directories", tmpDirNum); + return tmpDirNum; } -void destroyTmpDirList(List *list) +/* + * Calculate the size of share memory for temporary directory information + */ +Size TmpDirInfoArrayShmemSize(void) { - ListCell *lc = NULL; - - foreach(lc, list) + + if (AmIMaster()) { - char *tmpdir = (char *)lfirst(lc); - pfree(tmpdir); + TmpDirNum = GetTmpDirNumber(rm_master_tmp_dirs); } - list_free(list); + else if (AmISegment()) + { + TmpDirNum = GetTmpDirNumber(rm_seg_tmp_dirs); + } + else + { + elog(LOG, "Don't need create share memory for temporary directory information"); + TmpDirNum = 0; + } + + return MAXALIGN(TmpDirNum*sizeof(TmpDirInfo)); } -void getLocalTmpDirFromMasterConfig(int session_id) +/* + * Initialize share memory for temporary directory information + */ +void TmpDirInfoArrayShmemInit(void) { - List *tmpdirs = NULL; - - tmpdirs = initTmpDirList(tmpdirs, rm_master_tmp_dirs); - - LocalTempPath = pstrdup((char *)lfirst(list_nth_cell(tmpdirs, gp_session_id % list_length(tmpdirs)))); + bool found = false; - destroyTmpDirList(tmpdirs); + if (TmpDirNum == 0) + return; + + TmpDirInfoArray = (TmpDirInfo *)ShmemInitStruct("Temporary Directory Information Cache", + TmpDirNum*sizeof(TmpDirInfo), &found); + if(!TmpDirInfoArray) + { + elog(FATAL, + "Could not initialize Temporary Directory Information shared memory"); + } + + if(!found) + { + ListCell *lc = NULL; + int32_t i = 0; + MemSet(TmpDirInfoArray, 0, TmpDirNum*sizeof(TmpDirInfo)); + foreach(lc, tmpDirList) { + TmpDirInfoArray[i].available = true; + strncpy(TmpDirInfoArray[i].path, (char*)lfirst(lc), strlen((char*)lfirst(lc))); + i++; + } + + if (tmpDirList) + { + destroyTmpDirList(tmpDirList); + } + } + elog(LOG, "Initialize share memeory for temporary directory info finish."); +} + +/* + * Check if this temporary directory is OK to read or write. + * If not, it's probably due to disk error. + */ +bool CheckTmpDirAvailable(char *path) +{ + FILE *tmp = NULL; + bool ret = true; + char* fname = NULL; + char* testfile = "/checktmpdir.log"; + + /* write some bytes to a file to check if + * this temporary directory is OK. + */ + fname = palloc0(strlen(path) + strlen(testfile) + 1); + strncpy(fname, path, strlen(path)); + strncpy(fname + strlen(path), testfile, strlen(testfile)); + tmp = fopen(fname, "w"); + if (tmp == NULL) + { + elog(LOG, "Can't open file:%s when check temporary directory", fname); + ret = false; + goto _exit; + } + + if (fseek(tmp, 0, SEEK_SET) != 0) + { + elog(LOG, "Can't seek file:%s when check temporary directory", fname); + ret = false; + goto _exit; + } + + if (strlen("test") != fwrite("test", 1, strlen("test"), tmp)) + { + elog(LOG, "Can't write file:%s when check temporary directory", fname); + ret = false; + goto _exit; + } + +_exit: + if (fname != NULL) + pfree(fname); + if (tmp != NULL) + fclose(tmp); + return ret; +} + +/* + * Check the status of each temporary directory kept in + * shared memory, set to false if it is not available. + */ +void checkTmpDirStatus(void) +{ + LWLockAcquire(TmpDirInfoLock, LW_SHARED); + + for (int i = 0; i < TmpDirNum; i++) + { + bool oldStatus = TmpDirInfoArray[i].available; + bool newStatus = CheckTmpDirAvailable(TmpDirInfoArray[i].path); + if (oldStatus != newStatus) + { + LWLockRelease(TmpDirInfoLock); + LWLockAcquire(TmpDirInfoLock, LW_EXCLUSIVE); + TmpDirInfoArray[i].available = newStatus; + LWLockRelease(TmpDirInfoLock); + LWLockAcquire(TmpDirInfoLock, LW_SHARED); + } + } + + LWLockRelease(TmpDirInfoLock); + elog(LOG, "checkTmpDirStatus finish!"); } -void getLocalTmpDirFromSegmentConfig(int session_id, int command_id, int qeidx) +/* + * Get a list of failed temporary directory + */ +List* getFailedTmpDirList(void) { - List *tmpdirs = NULL; + List *failedList = NULL; + char *failedDir = NULL; + + LWLockAcquire(TmpDirInfoLock, LW_SHARED); + for (int i = 0; i < TmpDirNum; i++) + { + if (!TmpDirInfoArray[i].available) + { + failedDir = pstrdup(TmpDirInfoArray[i].path); + failedList = lappend(failedList, failedDir); + } + } + LWLockRelease(TmpDirInfoLock); + return failedList; +} - if (qeidx == -1) +/* + * Get a temporary directory path from array by its index + */ +char* GetTmpDirPathFromArray(int64_t idx) +{ + Insist(idx >=0 && idx <= TmpDirNum-1); + + LWLockAcquire(TmpDirInfoLock, LW_SHARED); + for (int cnt = 0; cnt < TmpDirNum; cnt++, idx++) + { + if (TmpDirInfoArray[idx].available) + { + LWLockRelease(TmpDirInfoLock); + return TmpDirInfoArray[idx].path; + } + else + { + if (idx == TmpDirNum-1) + { + /* start to look up the first element */ + idx = 0; + } + if (cnt == TmpDirNum-1) + { + /* all the temp dir are failed */ + ereport(FATAL, + (errcode(ERRCODE_CDB_INTERNAL_ERROR), + errmsg("Failed to find a valid temporary directory"))); + break; + } + } + } + LWLockRelease(TmpDirInfoLock); + return NULL; +} + +void getMasterLocalTmpDirFromShmem(int session_id) +{ + LocalTempPath = GetTmpDirPathFromArray(session_id % TmpDirNum); +} + +void getSegmentLocalTmpDirFromShmem(int session_id, int command_id, int qeidx) +{ + if(qeidx == -1) { - // QE on master - getLocalTmpDirFromMasterConfig(session_id); + getMasterLocalTmpDirFromShmem(session_id); } else { - getLocalTmpDirFromMasterConfig(session_id); - - // QE on segment - tmpdirs = initTmpDirList(tmpdirs, rm_seg_tmp_dirs); - int64_t session_key = session_id; - int64_t key = (session_key << 32) + command_id + qeidx; - LocalTempPath = pstrdup((char *)lfirst(list_nth_cell(tmpdirs, key % list_length(tmpdirs)))); - destroyTmpDirList(tmpdirs); + int64_t key = (session_id << 32) + command_id + qeidx; + LocalTempPath = GetTmpDirPathFromArray(key % TmpDirNum); } } diff --git a/src/backend/cdb/cdbvars.c b/src/backend/cdb/cdbvars.c index 8f2d8c8a62..77380ef090 100644 --- a/src/backend/cdb/cdbvars.c +++ b/src/backend/cdb/cdbvars.c @@ -349,6 +349,7 @@ int rm_nocluster_timeout; /* How many seconds to wait before int rm_tolerate_nseg_limit; int rm_rejectrequest_nseg_limit; +int rm_segdown_tmpdir_limit; int rm_nvseg_variance_among_seg_limit; int rm_container_batch_limit; diff --git a/src/backend/postmaster/identity.c b/src/backend/postmaster/identity.c index 7851fa8768..898a18af7c 100644 --- a/src/backend/postmaster/identity.c +++ b/src/backend/postmaster/identity.c @@ -414,8 +414,8 @@ SetupProcessIdentity(const char *str) } else { - getLocalTmpDirFromSegmentConfig(gp_session_id, gp_command_count, GetQEIndex()); - elog(DEBUG1, "getLocalTmpDirFromSegmentConfig session_id:%d command_id:%d qeidx:%d tmpdir:%s", gp_session_id, gp_command_count, GetQEIndex(), LocalTempPath); + getSegmentLocalTmpDirFromShmem(gp_session_id, gp_command_count, GetQEIndex()); + elog(DEBUG1, "getSegmentLocalTmpDirFromShmem session_id:%d command_id:%d qeidx:%d tmpdir:%s", gp_session_id, gp_command_count, GetQEIndex(), LocalTempPath); } } diff --git a/src/backend/resourcemanager/communication/rmcomm_RMSEG2RM.c b/src/backend/resourcemanager/communication/rmcomm_RMSEG2RM.c index 976c896a63..aa7ed37605 100644 --- a/src/backend/resourcemanager/communication/rmcomm_RMSEG2RM.c +++ b/src/backend/resourcemanager/communication/rmcomm_RMSEG2RM.c @@ -22,7 +22,7 @@ #include "communication/rmcomm_MessageHandler.h" #include "communication/rmcomm_RMSEG_RM_Protocol.h" #include "dynrm.h" - +#include "cdb/cdbtmpdir.h" #include "utils/memutilities.h" #include "utils/simplestring.h" #include "utils/linkedlist.h" @@ -107,9 +107,6 @@ int sendIMAlive(int *errorcode, int errorbufsize) { int res = FUNC_RETURN_OK; - - uint16_t dummyTempDirCount = 0; - uint16_t dummyTempDirBrokenCount = 0; AsyncCommBuffer newcommbuffer = NULL; Assert( DRMGlobalInstance->LocalHostStat != NULL ); @@ -119,8 +116,8 @@ int sendIMAlive(int *errorcode, initializeSelfMaintainBuffer(&tosend, PCONTEXT); RPCRequestHeadIMAliveData requesthead; - requesthead.TmpDirCount = dummyTempDirCount; - requesthead.TmpDirBrokenCount = dummyTempDirBrokenCount; + requesthead.TmpDirCount = TmpDirNum; + requesthead.TmpDirBrokenCount = DRMGlobalInstance->LocalHostStat->FailedTmpDirNum; requesthead.Reserved = 0; appendSMBVar(&tosend, requesthead); @@ -146,8 +143,8 @@ int sendIMAlive(int *errorcode, res = registerAsyncConnectionFileDesc(NULL, DRMGlobalInstance->SendToStandby? - standby_addr_host: - master_addr_host, + standby_addr_host: + master_addr_host, rm_master_port, ASYNCCOMM_READBYTES | ASYNCCOMM_WRITEBYTES, &AsyncCommBufferHandlersMessage, diff --git a/src/backend/resourcemanager/include/dynrm.h b/src/backend/resourcemanager/include/dynrm.h index 92ba8b8297..4c5879e655 100644 --- a/src/backend/resourcemanager/include/dynrm.h +++ b/src/backend/resourcemanager/include/dynrm.h @@ -120,6 +120,7 @@ bool handleRMRequestQuotaControl(void **arg); int refreshLocalHostInstance(void); void checkLocalPostmasterStatus(void); +void checkTmpDirStatus(void); /*----------------------------------------------------------------------------- * Dynamic resource manager overall APIs *----------------------------------------------------------------------------*/ @@ -218,6 +219,7 @@ struct DynRMGlobalData{ uint64_t LocalHostLastUpdateTime; uint64_t HeartBeatLastSentTime; + uint64_t TmpDirLastCheckTime; int32_t SegmentMemoryMB; double SegmentCore; /*------------------------------------------------------------------------*/ diff --git a/src/backend/resourcemanager/include/resourcepool.h b/src/backend/resourcemanager/include/resourcepool.h index f9b5081175..12ee5ae59b 100644 --- a/src/backend/resourcemanager/include/resourcepool.h +++ b/src/backend/resourcemanager/include/resourcepool.h @@ -67,6 +67,8 @@ struct SegInfoData { uint32_t GRMRackNameOffset; uint32_t GRMRackNameLen; uint32_t HostAddrCount; + uint32_t FailedTmpDirOffset; + uint32_t FailedTmpDirLen; uint8_t master; uint8_t standby; uint8_t alive; @@ -94,6 +96,12 @@ typedef struct SegInfoData SegInfoData; #define GET_SEGINFO_GRMHOSTNAME(seginfo) \ ((char *)(seginfo) + ((seginfo)->GRMHostNameOffset)) +/* + * Extract failed temporary string from SegInfo instance. + */ +#define GET_SEGINFO_FAILEDTMPDIR(seginfo) \ + ((char *)(seginfo) + ((seginfo)->FailedTmpDirOffset)) + /* * Macros for getting segment address content from SegInfo instance. */ @@ -135,17 +143,14 @@ void generateSegInfoReport(SegInfo seginfo, SelfMaintainBuffer buff); struct SegStatData { int32_t ID; /* Internal ID. */ + uint16_t FailedTmpDirNum; /* Failed temporary directory number */ uint8_t FTSAvailable; /* If it is available now. */ uint8_t GRMAvailable; /* If it is global resource available.*/ - uint8_t Reserved[2]; uint32_t FTSTotalMemoryMB; /* FTS reports memory capacity. */ uint32_t FTSTotalCore; /* FTS reports core capacity. */ uint32_t GRMTotalMemoryMB; /* GRM reports memory capacity. */ uint32_t GRMTotalCore; /* GRM reports core capacity. */ - - /* 64-bit aligned. */ - SegInfoData Info; /* 64-bit aligned. */ }; @@ -657,12 +662,20 @@ void checkSlavesFile(void); void cleanup_segment_config(void); /* update a segment's status in gp_segment_configuration table */ void update_segment_status(int32_t id, char status); +/* update a segment's status and failed temporary directory + * in gp_segment_configuration table + */ +void update_segment_failed_tmpdir +(int32_t id, char status, int32_t failedNum, char* failedTmpDir); /* Add a new entry into gp_segment_configuration table*/ void add_segment_config_row(int32_t id, char *hostname, char *address, uint32_t port, - char role); + char role, + char status, + uint32_t failed_tmpdir_num, + char* failed_tmpdir); /* * In resource pool, segment's id starts from 0, however in gp_segment_configuration table, diff --git a/src/backend/resourcemanager/requesthandler.c b/src/backend/resourcemanager/requesthandler.c index 2da4e4f987..fbabdcf0f8 100644 --- a/src/backend/resourcemanager/requesthandler.c +++ b/src/backend/resourcemanager/requesthandler.c @@ -592,7 +592,8 @@ bool handleRMSEGRequestIMAlive(void **arg) SelfMaintainBufferData machinereport; initializeSelfMaintainBuffer(&machinereport,PCONTEXT); SegStat segstat = (SegStat)(SMBUFF_CONTENT(&(conntrack->MessageBuff)) + - sizeof(RPCRequestHeadIMAliveData)); + sizeof(RPCRequestHeadIMAliveData)); + generateSegStatReport(segstat, &machinereport); elog(RMLOG, "Resource manager received segment machine information, %s", @@ -702,6 +703,25 @@ bool handleRMSEGRequestIMAlive(void **arg) newseginfoptr = SMBUFF_HEAD(SegInfo, &(newseginfo)); newseginfoptr->HostNameLen = strlen(fts_client_host->h_name); + appendSelfMaintainBufferTill64bitAligned(&newseginfo); + + /* fill in failed temporary directory string */ + if (fts_client_seginfo->FailedTmpDirLen != 0) + { + newseginfoptr->FailedTmpDirOffset = getSMBContentSize(&newseginfo); + newseginfoptr->FailedTmpDirLen = strlen(GET_SEGINFO_FAILEDTMPDIR(fts_client_seginfo)); + appendSMBStr(&newseginfo, GET_SEGINFO_FAILEDTMPDIR(fts_client_seginfo)); + elog(RMLOG, "Resource manager received IMAlive message, " + "failed temporary directory:%s", + GET_SEGINFO_FAILEDTMPDIR(fts_client_seginfo)); + appendSelfMaintainBufferTill64bitAligned(&newseginfo); + } + else + { + newseginfoptr->FailedTmpDirOffset = 0; + newseginfoptr->FailedTmpDirLen = 0; + } + newseginfoptr->Size = getSMBContentSize(&newseginfo); /* reported by segment, set GRM host/rack as NULL */ newseginfoptr->GRMHostNameLen = 0; @@ -729,7 +749,36 @@ bool handleRMSEGRequestIMAlive(void **arg) newsegstat->ID = SEGSTAT_ID_INVALID; newsegstat->GRMAvailable = RESOURCE_SEG_STATUS_UNSET; - newsegstat->FTSAvailable = RESOURCE_SEG_STATUS_AVAILABLE; + + RPCRequestHeadIMAlive header = SMBUFF_HEAD(RPCRequestHeadIMAlive, + &(conntrack->MessageBuff)); + newsegstat->FailedTmpDirNum = header->TmpDirBrokenCount; + + /* + * Check if the number of failed temporary directory on this segment + * exceeds the value of rm_segdown_tmpdir_limit, if exceeds, master consider + * this segment as down, even it has heart-beat report. + * + * Notes: If the number of temporary directory is not greater than + * rm_segdown_tmpdir_limit, this guc value is treated as 0. + * which means if one temporary directory is failed, this segment + * is considered as down. + */ + uint32_t failedTmpDirLimit = header->TmpDirCount <= rm_segdown_tmpdir_limit ? + 0 : rm_segdown_tmpdir_limit; + if (newsegstat->FailedTmpDirNum <= failedTmpDirLimit) + { + newsegstat->FTSAvailable = RESOURCE_SEG_STATUS_AVAILABLE; + } + else + { + elog(RMLOG, "Resource manager finds the number of failed temporary directory:%d " + "exceeds the guc rm_segdown_tmpdir_limit:%d, " + "so mark this segment unavailable.", + newsegstat->FailedTmpDirNum, + rm_segdown_tmpdir_limit); + newsegstat->FTSAvailable = RESOURCE_SEG_STATUS_UNAVAILABLE; + } bool capstatchanged = false; if ( addHAWQSegWithSegStat(newsegstat, &capstatchanged) != FUNC_RETURN_OK ) diff --git a/src/backend/resourcemanager/requesthandler_RMSEG.c b/src/backend/resourcemanager/requesthandler_RMSEG.c index c96913d3d6..66f929afd6 100644 --- a/src/backend/resourcemanager/requesthandler_RMSEG.c +++ b/src/backend/resourcemanager/requesthandler_RMSEG.c @@ -52,6 +52,9 @@ char *buildCGroupNameString(int64 masterStartTime, uint32 connId); */ int refreshLocalHostInstance(void) { + SimpString failedTmpDirStr; + initSimpleString(&failedTmpDirStr, PCONTEXT); + /* Get local host name. */ SimpString hostname; initSimpleString(&hostname, PCONTEXT); @@ -68,6 +71,33 @@ int refreshLocalHostInstance(void) addr->Address + 4); DQUEUE_LOOP_END + /* Get a list of failed temporary directory */ + List* failedTmpDir = getFailedTmpDirList(); + uint16_t failedTmpDirNum = list_length(failedTmpDir); + if (failedTmpDirNum > 0) + { + SelfMaintainBufferData buf; + initializeSelfMaintainBuffer(&buf, PCONTEXT); + uint16_t idx = 0; + ListCell *lc = NULL; + foreach(lc, failedTmpDir) + { + elog(LOG, "Get a failed temporary directory list for IMAlive message: %s", + (char *)lfirst(lc)); + appendSelfMaintainBuffer(&buf, (char *)lfirst(lc), strlen((char*)lfirst(lc))); + if (idx != failedTmpDirNum -1) + { + appendSelfMaintainBuffer(&buf, ",", 1); + } + idx++; + } + static char zeropad = '\0'; + appendSMBVar(&buf, zeropad); + setSimpleStringNoLen(&failedTmpDirStr, buf.Buffer); + elog(LOG, "Segment resource manager build failed tmp list string:%s", failedTmpDirStr.Str); + destroySelfMaintainBuffer(&buf); + } + bool shouldupdate = false; if ( DRMGlobalInstance->LocalHostStat == NULL ) { @@ -115,6 +145,27 @@ int refreshLocalHostInstance(void) } DQUEUE_LOOP_END } + + /* Check if the failed temporary directory are changed. */ + if( !shouldupdate && + DRMGlobalInstance->LocalHostStat->FailedTmpDirNum != failedTmpDirNum) + { + elog(LOG, "Segment resource manager changes number of failed " + "temporary from %d to %d.", + DRMGlobalInstance->LocalHostStat->FailedTmpDirNum, + failedTmpDirNum); + shouldupdate = true; + } + + if ( !shouldupdate && failedTmpDirNum != 0 ) + { + if (strcmp(GET_SEGINFO_FAILEDTMPDIR(info), failedTmpDirStr.Str) != 0) + { + elog(LOG, "Segment resource manager finds failed temporary directory change " + "from %s to %s", GET_SEGINFO_FAILEDTMPDIR(info), failedTmpDirStr.Str); + shouldupdate = true; + } + } } if ( shouldupdate ) @@ -134,8 +185,7 @@ int refreshLocalHostInstance(void) RMSEG_INBUILDHOST->GRMAvailable = RESOURCE_SEG_STATUS_UNSET; RMSEG_INBUILDHOST->FTSAvailable = RESOURCE_SEG_STATUS_AVAILABLE; RMSEG_INBUILDHOST->ID = SEGSTAT_ID_INVALID; - RMSEG_INBUILDHOST->Reserved[0] = 0; - RMSEG_INBUILDHOST->Reserved[1] = 0; + RMSEG_INBUILDHOST->FailedTmpDirNum = failedTmpDirNum; RMSEG_INBUILDHOST->Info.master = 0; /* I'm a segment. */ RMSEG_INBUILDHOST->Info.standby = 0; /* I'm a segment. */ @@ -182,6 +232,23 @@ int refreshLocalHostInstance(void) RMSEG_INBUILDHOST->Info.GRMRackNameLen = 0; RMSEG_INBUILDHOST->Info.GRMRackNameOffset = 0; + /* add failed temporary directory */ + if (failedTmpDirNum == 0) + { + RMSEG_INBUILDHOST->Info.FailedTmpDirOffset = 0; + RMSEG_INBUILDHOST->Info.FailedTmpDirLen = 0; + } + else + { + RMSEG_INBUILDHOST->Info.FailedTmpDirOffset = localsegstat.Cursor + 1 - + offsetof(SegStatData, Info); + appendSelfMaintainBuffer(&localsegstat, failedTmpDirStr.Str, failedTmpDirStr.Len+1); + appendSelfMaintainBufferTill64bitAligned(&localsegstat); + RMSEG_INBUILDHOST->Info.FailedTmpDirLen = failedTmpDirStr.Len; + elog(LOG, "Segment resource manager builds tmp dir:%s", + GET_SEGINFO_FAILEDTMPDIR(&RMSEG_INBUILDHOST->Info)); + } + /* get total size of this machine id. */ RMSEG_INBUILDHOST->Info.Size = localsegstat.Cursor + 1 - offsetof(SegStatData, Info); @@ -214,6 +281,8 @@ int refreshLocalHostInstance(void) DQUEUE_LOOP_END removeAllDQueueNodes(&addresses); cleanDQueue(&addresses); + destroyTmpDirList(failedTmpDir); + freeSimpleStringContent(&failedTmpDirStr); return FUNC_RETURN_OK; } diff --git a/src/backend/resourcemanager/resourcemanager.c b/src/backend/resourcemanager/resourcemanager.c index de6327db41..5aaf64a878 100644 --- a/src/backend/resourcemanager/resourcemanager.c +++ b/src/backend/resourcemanager/resourcemanager.c @@ -246,8 +246,8 @@ int ResManagerMain(int argc, char *argv[]) elog(DEBUG5, "HAWQ RM :: Passed initializing core data structure."); - /**************************************************************************/ - /* STEP 4. INIT for making RM process access catalog by CAQL etc. */ + /**************************************************************************/ + /* STEP 4. INIT for making RM process access catalog by CAQL etc. */ /**************************************************************************/ /* BLOCK signal behavior. Only another specific thread has the capability to * process the signal. */ @@ -263,45 +263,42 @@ int ResManagerMain(int argc, char *argv[]) pqsignal(SIGTTIN, SIG_IGN); pqsignal(SIGTTOU, SIG_IGN); - /* Only master side needs the access to catalog. */ - if ( DRMGlobalInstance->Role == START_RM_ROLE_MASTER ) { + CurrentResourceOwner = ResourceOwnerCreate(NULL, "Resource Manager"); - CurrentResourceOwner = ResourceOwnerCreate(NULL, "Resource Manager"); + BaseInit(); + InitProcess(); + InitBufferPoolBackend(); + InitXLOGAccess(); - BaseInit(); - InitProcess(); - InitBufferPoolBackend(); - InitXLOGAccess(); + SetProcessingMode(NormalProcessing); - SetProcessingMode(NormalProcessing); + MyDatabaseId = TemplateDbOid; + MyDatabaseTableSpace = DEFAULTTABLESPACE_OID; + if (!FindMyDatabase(probeDatabase, &MyDatabaseId, &MyDatabaseTableSpace)) + ereport(FATAL, (errcode(ERRCODE_UNDEFINED_DATABASE), + errmsg("database 'postgres' does not exist"))); - MyDatabaseId = TemplateDbOid; - MyDatabaseTableSpace = DEFAULTTABLESPACE_OID; - if (!FindMyDatabase(probeDatabase, &MyDatabaseId, &MyDatabaseTableSpace)) - ereport(FATAL, (errcode(ERRCODE_UNDEFINED_DATABASE), - errmsg("database 'postgres' does not exist"))); + char *fullpath = GetDatabasePath(MyDatabaseId, MyDatabaseTableSpace); - char *fullpath = GetDatabasePath(MyDatabaseId, MyDatabaseTableSpace); + SetDatabasePath(fullpath); - SetDatabasePath(fullpath); + InitProcessPhase2(); - InitProcessPhase2(); + MyBackendId = InvalidBackendId; - MyBackendId = InvalidBackendId; + SharedInvalBackendInit(false); - SharedInvalBackendInit(false); + if (MyBackendId > MaxBackends || MyBackendId <= 0) + elog(FATAL, "bad backend id: %d", MyBackendId); - if (MyBackendId > MaxBackends || MyBackendId <= 0) - elog(FATAL, "bad backend id: %d", MyBackendId); + InitBufferPoolBackend(); + RelationCacheInitialize(); + InitCatalogCache(); + RelationCacheInitializePhase2(); - InitBufferPoolBackend(); - RelationCacheInitialize(); - InitCatalogCache(); - RelationCacheInitializePhase2(); - } - /* END: INIT for making RM process access catalog by caql etc. */ - /**************************************************************************/ - PG_SETMASK(&UnBlockSig); + /* END: INIT for making RM process access catalog by caql etc. */ + /**************************************************************************/ + PG_SETMASK(&UnBlockSig); /* Save process fork mode. */ DRMGlobalInstance->ThisPID = getpid(); @@ -486,7 +483,10 @@ int ResManagerMainServer2ndPhase(void) DRMGlobalInstance->SocketLocalHostName.Str, DRMGlobalInstance->SocketLocalHostName.Str, PostPortNumber, - SEGMENT_ROLE_MASTER_CONFIG); + SEGMENT_ROLE_MASTER_CONFIG, + SEGMENT_STATUS_UP, + 0, + ""); /* Load queue and user definition as no DDL now. */ res = loadAllQueueAndUser(); @@ -813,6 +813,7 @@ int initializeDRMInstance(MCTYPE context) DRMGlobalInstance->LocalHostLastUpdateTime = 0; DRMGlobalInstance->HeartBeatLastSentTime = 0; + DRMGlobalInstance->TmpDirLastCheckTime = 0; DRMGlobalInstance->LocalHostStat = NULL; initializeDQueue(&(DRMGlobalInstance->LocalHostTempDirectoriesForQD), context); @@ -2765,12 +2766,11 @@ int loadHostInformationIntoResourcePool(void) segstat->FTSTotalCore = DRMGlobalInstance->SegmentCore; segstat->GRMTotalMemoryMB = 0; segstat->GRMTotalCore = 0; - segstat->Reserved[0] = 0; - segstat->Reserved[1] = 0; + segstat->FailedTmpDirNum = 0; memcpy((char *)segstat + offsetof(SegStatData, Info), - seginfobuff.Buffer, - seginfobuff.Cursor+1); + seginfobuff.Buffer, + seginfobuff.Cursor+1); SelfMaintainBufferData segreport; initializeSelfMaintainBuffer(&segreport,PCONTEXT); diff --git a/src/backend/resourcemanager/resourcemanager_RMSEG.c b/src/backend/resourcemanager/resourcemanager_RMSEG.c index 966bc7ce70..6b29a4017d 100644 --- a/src/backend/resourcemanager/resourcemanager_RMSEG.c +++ b/src/backend/resourcemanager/resourcemanager_RMSEG.c @@ -26,6 +26,7 @@ #include "communication/rmcomm_MessageServer.h" #include "communication/rmcomm_RMSEG2RM.h" #include "resourceenforcer/resourceenforcer.h" +#include "cdb/cdbtmpdir.h" int ResManagerMainSegment2ndPhase(void) { @@ -60,7 +61,7 @@ int ResManagerMainSegment2ndPhase(void) */ initCGroupThreads(); - InitFileAccess(); + //InitFileAccess(); /* * Notify postmaster that HAWQ RM is ready. Ignore the possible problem that @@ -151,6 +152,7 @@ int initializeSocketServer_RMSEG(void) } #define SEGMENT_HEARTBEAT_INTERVAL (3LL * 1000000LL) #define SEGMENT_HOSTCHECK_INTERVAL (5LL * 1000000LL) +#define SEGMENT_TMPDIRCHECK_INTERVAL (10 * 60LL * 1000000LL) int MainHandlerLoop_RMSEG(void) { int res = FUNC_RETURN_OK; @@ -191,6 +193,12 @@ int MainHandlerLoop_RMSEG(void) checkLocalPostmasterStatus(); } + if ( curtime - DRMGlobalInstance->TmpDirLastCheckTime > + SEGMENT_TMPDIRCHECK_INTERVAL ) { + checkTmpDirStatus(); + DRMGlobalInstance->TmpDirLastCheckTime = gettime_microsec(); + } + if ( DRMGlobalInstance->SendIMAlive ) { if (DRMGlobalInstance->LocalHostStat != NULL && curtime - DRMGlobalInstance->HeartBeatLastSentTime > diff --git a/src/backend/resourcemanager/resourcepool.c b/src/backend/resourcemanager/resourcepool.c index 8f1d47e6c3..2e4cf0c7bf 100644 --- a/src/backend/resourcemanager/resourcepool.c +++ b/src/backend/resourcemanager/resourcepool.c @@ -542,6 +542,87 @@ void update_segment_status(int32_t id, char status) PQfinish(conn); } +/* + * update a segment's status and failed tmp dir + * in gp_segment_configuration table. + * id : registration order of this segment + * status : new status of this segment + * failedNum : number of failed temporary directory + * failedTmpDir : failed temporary directory list, separated by comma + */ +void update_segment_failed_tmpdir +(int32_t id, char status, int32_t failedNum, char* failedTmpDir) +{ + int libpqres = CONNECTION_OK; + PGconn *conn = NULL; + char conninfo[512]; + PQExpBuffer sql = NULL; + PGresult* result = NULL; + + sprintf(conninfo, "options='-c gp_session_role=UTILITY -c allow_system_table_mods=dml' " + "dbname=template1 port=%d connect_timeout=%d", master_addr_port, CONNECT_TIMEOUT); + conn = PQconnectdb(conninfo); + if ((libpqres = PQstatus(conn)) != CONNECTION_OK) + { + elog(WARNING, "Fail to connect database when update segment's failed tmpdir " + "in segment configuration catalog table, error code: %d, %s", + libpqres, + PQerrorMessage(conn)); + PQfinish(conn); + return; + } + + result = PQexec(conn, "BEGIN"); + if (!result || PQresultStatus(result) != PGRES_COMMAND_OK) + { + elog(WARNING, "Fail to run SQL: %s when update segment's failed tmpdir " + "in segment configuration catalog table, reason : %s", + "BEGIN", + PQresultErrorMessage(result)); + goto cleanup; + } + PQclear(result); + + sql = createPQExpBuffer(); + appendPQExpBuffer(sql, "UPDATE gp_segment_configuration SET " + "status='%c', failed_tmpdir_num = '%d', failed_tmpdir = '%s' " + "WHERE registration_order=%d", + status, failedNum, failedTmpDir, id); + result = PQexec(conn, sql->data); + if (!result || PQresultStatus(result) != PGRES_COMMAND_OK) + { + elog(WARNING, "Fail to run SQL: %s when update segment's failed tmpdir " + "in segment configuration catalog table, reason : %s", + sql->data, + PQresultErrorMessage(result)); + goto cleanup; + } + PQclear(result); + + result = PQexec(conn, "COMMIT"); + if (!result || PQresultStatus(result) != PGRES_COMMAND_OK) + { + elog(WARNING, "Fail to run SQL: %s when update segment's failed tmpdir " + "in segment configuration catalog table, reason : %s", + "COMMIT", + PQresultErrorMessage(result)); + goto cleanup; + } + + elog(LOG, "Update a segment's failed tmpdir:" + "status to '%c', failed_tmpdir_num to '%d', failed_tmpdir to '%s' " + "in segment configuration catalog table," + "registration_order : %d", + status, failedNum, failedTmpDir, id); + +cleanup: + if(sql) + destroyPQExpBuffer(sql); + if(result) + PQclear(result); + PQfinish(conn); +} + /* * add a row into table gp_segment_configuration using psql * id : registration order of this segment @@ -549,8 +630,19 @@ void update_segment_status(int32_t id, char status) * addreess : IP address of this segment * port : port of this segment * role : role of this segment + * status : up or down + * failed_tmpdir_num : the number of failed temporary directory + * failed_tmpdir : failed temporary directory, separated by comma */ -void add_segment_config_row(int32_t id, char* hostname, char* address, uint32_t port, char role) +void add_segment_config_row(int32_t id, + char* hostname, + char* address, + uint32_t port, + char role, + char status, + uint32_t + failed_tmpdir_num, + char* failed_tmpdir) { int libpqres = CONNECTION_OK; PGconn *conn = NULL; @@ -583,11 +675,23 @@ void add_segment_config_row(int32_t id, char* hostname, char* address, uint32_t PQclear(result); sql = createPQExpBuffer(); - appendPQExpBuffer(sql, - "INSERT INTO gp_segment_configuration(registration_order,role,status,port,hostname,address) " - "VALUES " - "(%d,'%c','%c',%d,'%s','%s')", - id,role,SEGMENT_STATUS_UP,port,hostname,address); + if (role == SEGMENT_ROLE_PRIMARY) + { + appendPQExpBuffer(sql, + "INSERT INTO gp_segment_configuration" + "(registration_order,role,status,port,hostname,address,failed_tmpdir_num,failed_tmpdir) " + "VALUES " + "(%d,'%c','%c',%d,'%s','%s',%d,'%s')", + id,role,status,port,hostname,address,failed_tmpdir_num,failed_tmpdir); + } + else + { + appendPQExpBuffer(sql, + "INSERT INTO gp_segment_configuration(registration_order,role,status,port,hostname,address) " + "VALUES " + "(%d,'%c','%c',%d,'%s','%s')", + id,role,status,port,hostname,address); + } result = PQexec(conn, sql->data); if (!result || PQresultStatus(result) != PGRES_COMMAND_OK) { @@ -610,8 +714,8 @@ void add_segment_config_row(int32_t id, char* hostname, char* address, uint32_t } elog(LOG, "Add a new row into segment configuration catalog table," - "registration order:%d, role:%c, port:%d, hostname:%s, address:%s", - id, role, port, hostname, address); + "registration order:%d, role:%c, status:%c, port:%d, hostname:%s, address:%s", + id, role, status, port, hostname, address); cleanup: if(sql) @@ -678,7 +782,8 @@ int addHAWQSegWithSegStat(SegStat segstat, bool *capstatchanged) /* CASE 1. It is a new host. */ if ( res != FUNC_RETURN_OK ) { - *capstatchanged = true; + uint8_t reportStatus = segstat->FTSAvailable; + /* Create machine information and corresponding resource information. */ segresource = createSegResource(segstat); @@ -741,10 +846,11 @@ int addHAWQSegWithSegStat(SegStat segstat, bool *capstatchanged) } /* - * Set this node HAWQ available. This is a new host, it is always set - * HAWQ available because this is from FTS heart-beat of one segment. + * This is a new host registration. Normally the status is available, + * But if the number of failed temporary directory exceeds guc, + * this segment is considered as unavailable. */ - setSegResHAWQAvailability(segresource, RESOURCE_SEG_STATUS_AVAILABLE); + setSegResHAWQAvailability(segresource, reportStatus); /* Add this node into the table gp_segment_configuration */ AddressString straddr = NULL; @@ -753,19 +859,29 @@ int addHAWQSegWithSegStat(SegStat segstat, bool *capstatchanged) if (Gp_role != GP_ROLE_UTILITY) { - add_segment_config_row(segid+REGISTRATION_ORDER_OFFSET, - hostname, - straddr->Address, - segresource->Stat->Info.port, - SEGMENT_ROLE_PRIMARY); + add_segment_config_row (segid+REGISTRATION_ORDER_OFFSET, + hostname, + straddr->Address, + segresource->Stat->Info.port, + SEGMENT_ROLE_PRIMARY, + segresource->Stat->FTSAvailable == RESOURCE_SEG_STATUS_AVAILABLE ? + SEGMENT_STATUS_UP:SEGMENT_STATUS_DOWN, + segresource->Stat->FailedTmpDirNum, + segresource->Stat->FailedTmpDirNum == 0 ? + "":GET_SEGINFO_FAILEDTMPDIR(&segresource->Stat->Info)); + } + + if (segresource->Stat->FTSAvailable == RESOURCE_SEG_STATUS_AVAILABLE) + { + /* Add this node into the io bytes workload BBST structure. */ + addSegResourceCombinedWorkloadIndex(segresource); + /* Add this node into the alloc/avail resource ordered indices. */ + addSegResourceAvailIndex(segresource); + addSegResourceAllocIndex(segresource); + segcapchanged = true; + *capstatchanged = true; } - /* Add this node into the io bytes workload BBST structure. */ - addSegResourceCombinedWorkloadIndex(segresource); - /* Add this node into the alloc/avail resource ordered indices. */ - addSegResourceAvailIndex(segresource); - addSegResourceAllocIndex(segresource); - segcapchanged = true; res = FUNC_RETURN_OK; } /* @@ -777,31 +893,165 @@ int addHAWQSegWithSegStat(SegStat segstat, bool *capstatchanged) segresource = getSegResource(segid); Assert(segresource != NULL); - if ( !IS_SEGSTAT_FTSAVAILABLE(segresource->Stat) ) + /* Check if temporary directory path is changed */ + bool tmpDirChanged = false; + if (segresource->Stat->FailedTmpDirNum != segstat->FailedTmpDirNum) { - setSegResHAWQAvailability(segresource, RESOURCE_SEG_STATUS_AVAILABLE); - if (Gp_role != GP_ROLE_UTILITY) + tmpDirChanged = true; + } + + if (!tmpDirChanged && segresource->Stat->FailedTmpDirNum != 0) + { + if (strcmp(GET_SEGINFO_FAILEDTMPDIR(&segresource->Stat->Info), + GET_SEGINFO_FAILEDTMPDIR(&segstat->Info)) != 0) { - update_segment_status(segresource->Stat->ID + REGISTRATION_ORDER_OFFSET, - SEGMENT_STATUS_UP); + tmpDirChanged = true; + elog(LOG, "Resource manager finds segment %s(%d) 's " + "failed temporary directory is changed from " + "'%s' to '%s'", + GET_SEGRESOURCE_HOSTNAME(segresource), + segid, + GET_SEGINFO_FAILEDTMPDIR(&segresource->Stat->Info), + GET_SEGINFO_FAILEDTMPDIR(&segstat->Info)); } + } - elog(LOG, "Resource manager sets segment %s(%d) up from down.", - GET_SEGRESOURCE_HOSTNAME(segresource), - segid); + /* + * Either the FTSAvailable or the failed temporary directory + * of this segment is changed. + */ + uint8_t oldStatus = segresource->Stat->FTSAvailable; + bool statusChanged = oldStatus != segstat->FTSAvailable; + if (statusChanged || tmpDirChanged) + { + if (statusChanged && !tmpDirChanged) + { + if (Gp_role != GP_ROLE_UTILITY) + { + update_segment_status(segresource->Stat->ID + REGISTRATION_ORDER_OFFSET, + segstat->FTSAvailable == RESOURCE_SEG_STATUS_AVAILABLE ? + SEGMENT_STATUS_UP:SEGMENT_STATUS_DOWN); + } - /* The segment is up again, its capacity should be considered again. */ - *capstatchanged = true; + /* + * Segment is set from up to down, return resource. + */ + if (oldStatus == RESOURCE_SEG_STATUS_AVAILABLE) + { + /* The segment is up again, its capacity should be considered again. */ + *capstatchanged = true; + returnAllGRMResourceFromSegment(segresource); + } + + elog(LOG, "Master resource manager sets segment %s(%d)'s status " + "to %c", + GET_SEGRESOURCE_HOSTNAME(segresource), + segid, + segstat->FTSAvailable == RESOURCE_SEG_STATUS_AVAILABLE ? + SEGMENT_STATUS_UP:SEGMENT_STATUS_DOWN); + } + else + { + /* + * Failed temporary directory is changed, + * if the length of new failed temporary directory exceeds the old one, + * we need to repalloc SegInfoData + */ + elog(RMLOG, "Master resource manager is gonna set segment %s(%d)'s " + "failed temporary directory from '%s' to '%s'", + GET_SEGRESOURCE_HOSTNAME(segresource), + segid, + segresource->Stat->FailedTmpDirNum == 0 ? + "" : GET_SEGINFO_FAILEDTMPDIR(&segresource->Stat->Info), + segstat->FailedTmpDirNum == 0 ? + "" : GET_SEGINFO_FAILEDTMPDIR(&segstat->Info)); + + int old = segresource->Stat->Info.FailedTmpDirLen == 0 ? + 0 :__SIZE_ALIGN64(segresource->Stat->Info.FailedTmpDirLen+1); + int new = segstat->Info.FailedTmpDirLen == 0 ? + 0 : __SIZE_ALIGN64(segstat->Info.FailedTmpDirLen+1); + if (new > old && + segresource->Stat->Info.Size - + (segresource->Stat->Info.HostNameOffset + __SIZE_ALIGN64(segresource->Stat->Info.HostNameLen+1)) + < new) + { + SegStat newSegStat = rm_repalloc(PCONTEXT, + segresource->Stat, + offsetof(SegStatData, Info) + + segresource->Stat->Info.Size + (new - old)); + segresource->Stat = newSegStat; + memset((char*)&segresource->Stat->Info + segresource->Stat->Info.Size, 0, (new - old)); + segresource->Stat->Info.Size += (new - old); + } + + if (segstat->FailedTmpDirNum != 0) + { + segresource->Stat->Info.FailedTmpDirOffset = segresource->Stat->Info.HostNameOffset + + __SIZE_ALIGN64(segresource->Stat->Info.HostNameLen+1); + memcpy((char *)&segresource->Stat->Info + segresource->Stat->Info.FailedTmpDirOffset, + GET_SEGINFO_FAILEDTMPDIR(&segstat->Info), + strlen(GET_SEGINFO_FAILEDTMPDIR(&segstat->Info))); + memset((char *)&segresource->Stat->Info + + segresource->Stat->Info.FailedTmpDirOffset + + segstat->Info.FailedTmpDirLen, + 0, + segresource->Stat->Info.Size - + segresource->Stat->Info.FailedTmpDirOffset - + segstat->Info.FailedTmpDirLen); + } + else + { + memset((char *)&segresource->Stat->Info + segresource->Stat->Info.FailedTmpDirOffset, + 0, + segresource->Stat->Info.Size - segresource->Stat->Info.FailedTmpDirOffset); + segresource->Stat->Info.FailedTmpDirOffset = 0; + } + segresource->Stat->Info.FailedTmpDirLen = segstat->Info.FailedTmpDirLen; + segresource->Stat->FailedTmpDirNum = segstat->FailedTmpDirNum; + + setSegResHAWQAvailability(segresource, segstat->FTSAvailable); + if (Gp_role != GP_ROLE_UTILITY) + { + update_segment_failed_tmpdir(segresource->Stat->ID + REGISTRATION_ORDER_OFFSET, + segresource->Stat->FTSAvailable == RESOURCE_SEG_STATUS_AVAILABLE ? + SEGMENT_STATUS_UP:SEGMENT_STATUS_DOWN, + segresource->Stat->FailedTmpDirNum, + segresource->Stat->FailedTmpDirNum == 0 ? + "" : GET_SEGINFO_FAILEDTMPDIR(&segresource->Stat->Info)); + } + + if (statusChanged) + { + *capstatchanged = true; + /* + * Segment is set from up to down, return resource. + */ + if (oldStatus == RESOURCE_SEG_STATUS_AVAILABLE) + { + returnAllGRMResourceFromSegment(segresource); + } + } + + elog(LOG, "Master resource manager sets segment %s(%d)'s " + "failed temporary directory to '%s', status:%c", + GET_SEGRESOURCE_HOSTNAME(segresource), + segid, + segresource->Stat->FailedTmpDirNum == 0 ? + "" : GET_SEGINFO_FAILEDTMPDIR(&segresource->Stat->Info), + segresource->Stat->FTSAvailable == RESOURCE_SEG_STATUS_AVAILABLE ? + SEGMENT_STATUS_UP : SEGMENT_STATUS_DOWN); + } } /* The machine should be up. Update port number. */ segresource->Stat->Info.port = segstat->Info.port; /* Update node capacity. */ - if (((segstat->FTSTotalCore > 0) && + if ( segresource->Stat->FTSAvailable == RESOURCE_SEG_STATUS_AVAILABLE && + (((segstat->FTSTotalCore > 0) && segresource->Stat->FTSTotalCore != segstat->FTSTotalCore) || ((segstat->FTSTotalMemoryMB > 0) && - segresource->Stat->FTSTotalMemoryMB != segstat->FTSTotalMemoryMB)) + segresource->Stat->FTSTotalMemoryMB != segstat->FTSTotalMemoryMB))) { uint32_t oldftsmem = segresource->Stat->FTSTotalMemoryMB; @@ -831,8 +1081,7 @@ int addHAWQSegWithSegStat(SegStat segstat, bool *capstatchanged) segcapchanged = oldftsmem != segresource->Stat->FTSTotalMemoryMB || oldftscore != segresource->Stat->FTSTotalCore; - - *capstatchanged = segcapchanged; + *capstatchanged = *capstatchanged ? true:segcapchanged; } /* update the status of this node */ @@ -1116,9 +1365,9 @@ SegResource createSegResource(SegStat segstat) res->NVSeg = 0; res->Stat = segstat; res->LastUpdateTime = gettime_microsec(); + res->RUAlivePending = false; res->Stat->FTSAvailable = RESOURCE_SEG_STATUS_UNSET; res->Stat->GRMAvailable = RESOURCE_SEG_STATUS_UNSET; - res->RUAlivePending = false; for ( int i = 0 ; i < RESOURCE_QUEUE_RATIO_SIZE ; ++i ) { @@ -1159,7 +1408,8 @@ int setSegResHAWQAvailability( SegResource segres, uint8_t newstatus) return res; } - if ( newstatus == RESOURCE_SEG_STATUS_UNAVAILABLE ) + if ( res == RESOURCE_SEG_STATUS_AVAILABLE && + newstatus == RESOURCE_SEG_STATUS_UNAVAILABLE ) { minusResourceBundleData(&(PRESPOOL->FTSTotal), segres->Stat->FTSTotalMemoryMB, @@ -1174,7 +1424,7 @@ int setSegResHAWQAvailability( SegResource segres, uint8_t newstatus) Assert(PRESPOOL->AvailNodeCount >= 0); setSegResRUAlivePending(segres, false); } - else + else if (newstatus == RESOURCE_SEG_STATUS_AVAILABLE) { addResourceBundleData(&(PRESPOOL->FTSTotal), segres->Stat->FTSTotalMemoryMB, @@ -1189,6 +1439,11 @@ int setSegResHAWQAvailability( SegResource segres, uint8_t newstatus) PRESPOOL->AvailNodeCount++; } } + else + { + /* Unset to unavailable, just return */ + return res; + } for ( int i = 0 ; i < PQUEMGR->RatioCount ; ++i ) { @@ -1281,6 +1536,16 @@ void generateSegInfoReport(SegInfo seginfo, SelfMaintainBuffer buff) } generateSegInfoAddrStr(seginfo, i, buff); } + + appendSelfMaintainBuffer(buff, ".", sizeof(".") - 1); + if (seginfo->FailedTmpDirLen != 0) + { + appendSelfMaintainBuffer(buff, "Failed Tmp Dir:", sizeof("Failed Tmp Dir:")-1); + appendSelfMaintainBuffer(buff, + GET_SEGINFO_FAILEDTMPDIR(seginfo), + seginfo->FailedTmpDirLen); + } + appendSMBVar(buff, zeropad); } diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 90c87dc0f4..4898c73eaa 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -63,6 +63,7 @@ #include "executor/spi.h" #include "utils/workfile_mgr.h" #include "cdb/cdbmetadatacache.h" +#include "cdb/cdbtmpdir.h" #include "utils/session_state.h" shmem_startup_hook_type shmem_startup_hook = NULL; @@ -173,6 +174,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) size = add_size(size, MetadataCache_ShmemSize()); elog(LOG, "Metadata Cache Share Memory Size : %lu", MetadataCache_ShmemSize()); } + size = add_size(size, TmpDirInfoArrayShmemSize()); #ifdef FAULT_INJECTOR size = add_size(size, FaultInjector_ShmemSize()); @@ -293,7 +295,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) { MetadataCache_ShmemInit(); } - + TmpDirInfoArrayShmemInit(); if (!IsUnderPostmaster) { diff --git a/src/backend/utils/gp/segadmin.c b/src/backend/utils/gp/segadmin.c index 6a7d3d5fe6..6b49926aed 100644 --- a/src/backend/utils/gp/segadmin.c +++ b/src/backend/utils/gp/segadmin.c @@ -277,6 +277,8 @@ gp_add_master_standby(PG_FUNCTION_ARGS) values[Anum_gp_segment_configuration_port - 1] = Int32GetDatum(master->port); values[Anum_gp_segment_configuration_hostname - 1] = PG_GETARG_DATUM(0); values[Anum_gp_segment_configuration_address - 1] = PG_GETARG_DATUM(1); + nulls[Anum_gp_segment_configuration_failed_tmpdir_num - 1] = true; + nulls[Anum_gp_segment_configuration_failed_tmpdir - 1] = true; tuple = caql_form_tuple(pcqCtx, values, nulls); diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index e4d0752461..8d488363ee 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -441,8 +441,8 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, } else { - getLocalTmpDirFromMasterConfig(gp_session_id); - elog(LOG, "getLocalTmpDirFromMasterConfig session_id:%d tmpdir:%s", gp_session_id, LocalTempPath); + getMasterLocalTmpDirFromShmem(gp_session_id); + elog(LOG, "getMasterLocalTmpDirFromShmem session_id:%d tmpdir:%s", gp_session_id, LocalTempPath); } } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index b75dcb07df..de42cdc96a 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -6433,6 +6433,16 @@ static struct config_int ConfigureNamesInt[] = 2, 0, 65535, NULL, NULL }, + { + {"hawq_rm_segdown_tmpdir_limit", PGC_POSTMASTER, RESOURCES_MGM, + gettext_noop("resource manager considers a segment as down if the number of " + "failed temporary directory on this segment is greater than this limit value."), + NULL + }, + &rm_segdown_tmpdir_limit, + 2, 0, 65535, NULL, NULL + }, + { {"hawq_rm_nvseg_variance_amon_seg_limit", PGC_POSTMASTER, RESOURCES_MGM, gettext_noop("the variance of vseg number in each segment that resource manager should tolerate at most."), diff --git a/src/include/catalog/gp_segment_config.h b/src/include/catalog/gp_segment_config.h index ad489a4b73..f9f2def197 100644 --- a/src/include/catalog/gp_segment_config.h +++ b/src/include/catalog/gp_segment_config.h @@ -52,7 +52,9 @@ status "char" , port integer , hostname text , - address text + address text , + failed_tmpdir_num integer , + failed_tmpdir text ); create unique index on gp_segment_configuration(registration_order) with (indexid=6106, indexname=gp_segment_config_registration_order_index); @@ -64,7 +66,7 @@ WARNING: DO NOT MODIFY THE FOLLOWING SECTION: Generated by tidycat.pl version 34 - on Thu Feb 5 16:28:45 2015 + on Sat Jan 2 21:01:04 2016 */ @@ -93,6 +95,8 @@ CATALOG(gp_segment_configuration,5036) BKI_SHARED_RELATION BKI_WITHOUT_OIDS int4 port; text hostname; text address; + int4 failed_tmpdir_num; + text failed_tmpdir; } FormData_gp_segment_configuration; @@ -108,13 +112,16 @@ typedef FormData_gp_segment_configuration *Form_gp_segment_configuration; * compiler constants for gp_segment_configuration * ---------------- */ -#define Natts_gp_segment_configuration 6 +#define Natts_gp_segment_configuration 8 #define Anum_gp_segment_configuration_registration_order 1 #define Anum_gp_segment_configuration_role 2 #define Anum_gp_segment_configuration_status 3 #define Anum_gp_segment_configuration_port 4 #define Anum_gp_segment_configuration_hostname 5 #define Anum_gp_segment_configuration_address 6 +#define Anum_gp_segment_configuration_failed_tmpdir_num 7 +#define Anum_gp_segment_configuration_failed_tmpdir 8 + /* TIDYCAT_END_CODEGEN */ diff --git a/src/include/cdb/cdbtmpdir.h b/src/include/cdb/cdbtmpdir.h index 62d304629d..63feb8d823 100644 --- a/src/include/cdb/cdbtmpdir.h +++ b/src/include/cdb/cdbtmpdir.h @@ -19,8 +19,27 @@ #ifndef CDBTMPDIR_H #define CDBTMPDIR_H +#include "c.h" -void getLocalTmpDirFromMasterConfig(int session_id); -void getLocalTmpDirFromSegmentConfig(int session_id, int command_id, int qeidx); +#define MAX_TMP_DIR_LEN 8192 + +typedef struct TmpDirInfo +{ + bool available; + char path[MAX_TMP_DIR_LEN]; +} TmpDirInfo; + +extern int32_t TmpDirNum; + +Size TmpDirInfoArrayShmemSize(void); +void TmpDirInfoArrayShmemInit(void); +char* GetTmpDirPathFromArray(int64_t idx); +bool DestroyTmpDirInfoArray(TmpDirInfo *info); +bool CheckTmpDirAvailable(char *path); +void destroyTmpDirList(List *list); +void checkTmpDirStatus(void); + +void getMasterLocalTmpDirFromShmem(int session_id); +void getSegmentLocalTmpDirFromShmem(int session_id, int command_id, int qeidx); #endif diff --git a/src/include/cdb/cdbvars.h b/src/include/cdb/cdbvars.h index 076374c998..e383b23c0a 100644 --- a/src/include/cdb/cdbvars.h +++ b/src/include/cdb/cdbvars.h @@ -1188,6 +1188,7 @@ extern int rm_session_lease_heartbeat_interval; extern int rm_nocluster_timeout; extern int rm_tolerate_nseg_limit; extern int rm_rejectrequest_nseg_limit; +extern int rm_segdown_tmpdir_limit; extern int rm_nvseg_variance_among_seg_limit; extern int rm_container_batch_limit; extern char *rm_resourcepool_test_filename; diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 9272fc7abd..b5976518ef 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -86,7 +86,8 @@ typedef enum LWLockId AOSegFileLock, ParquetSegFileLock, PersistentObjLock, - MetadataCacheLock, + MetadataCacheLock, + TmpDirInfoLock, FileRepShmemLock, FileRepAckShmemLock, FileRepAckHashShmemLock, diff --git a/tools/bin/gppylib/data/2.0.json b/tools/bin/gppylib/data/2.0.json index af8bede710..3fbd34903e 100644 --- a/tools/bin/gppylib/data/2.0.json +++ b/tools/bin/gppylib/data/2.0.json @@ -1,5 +1,5 @@ { - "__comment" : "Generated by ./tidycat.pl version 34 on Thu Sep 17 15:49:18 2015 CATALOG_VERSION_NO=201507221", + "__comment" : "Generated by tidycat.pl version 34 on Sat Jan 2 21:08:04 2016 CATALOG_VERSION_NO=201507221", "__info" : { "CATALOG_VERSION_NO" : "201507221" }, @@ -940,6 +940,8 @@ "UppercaseToastReltypeOid" : "GP_SEGMENT_CONFIGURATION_TOAST_RELTYPE_OID", "colh" : { "address" : "text", + "failed_tmpdir" : "text", + "failed_tmpdir_num" : "int4", "hostname" : "text", "port" : "int4", "registration_order" : "int4", @@ -977,6 +979,16 @@ "colname" : "address", "ctype" : "text", "sqltype" : "text" + }, + { + "colname" : "failed_tmpdir_num", + "ctype" : "int4", + "sqltype" : "integer" + }, + { + "colname" : "failed_tmpdir", + "ctype" : "text", + "sqltype" : "text" } ], "filename" : "gp_segment_config.h", @@ -1015,7 +1027,7 @@ } ], "relid_comment_tag" : "/* relation id: 5036 - gp_segment_configuration */\n", - "tabdef_text" : "\n CREATE TABLE gp_segment_configuration\n with (camelcase=GpSegmentConfig, shared=true, oid=false, relid=5036, reltype_oid=6442, toast_oid=2900, toast_index=2901, toast_reltype=2906, content=MASTER_ONLY)\n (\n registration_order integer ,\n role \"char\" ,\n status \"char\" ,\n port integer ,\n hostname text ,\n address text\n )", + "tabdef_text" : "\n CREATE TABLE gp_segment_configuration\n with (camelcase=GpSegmentConfig, shared=true, oid=false, relid=5036, reltype_oid=6442, toast_oid=2900, toast_index=2901, toast_reltype=2906, content=MASTER_ONLY)\n (\n registration_order integer ,\n role \"char\" ,\n status \"char\" ,\n port integer ,\n hostname text ,\n address text ,\n failed_tmpdir_num integer ,\n failed_tmpdir text\n )", "with" : { "bootstrap" : 0, "camelcase" : "GpSegmentConfig", From 30d373ec357a65485fdce072038a459fd9159351 Mon Sep 17 00:00:00 2001 From: Wen Lin Date: Mon, 18 Jan 2016 14:09:44 +0800 Subject: [PATCH 2/4] remove guc, one failed tmp dir happens, make it down. --- src/backend/cdb/cdbtmpdir.c | 34 +++++++------------- src/backend/cdb/cdbvars.c | 1 - src/backend/resourcemanager/requesthandler.c | 21 ++++-------- src/backend/resourcemanager/resourcepool.c | 2 +- src/backend/utils/misc/guc.c | 10 ------ src/include/cdb/cdbvars.h | 1 - 6 files changed, 18 insertions(+), 51 deletions(-) diff --git a/src/backend/cdb/cdbtmpdir.c b/src/backend/cdb/cdbtmpdir.c index 687bc5f81f..40db8836c0 100644 --- a/src/backend/cdb/cdbtmpdir.c +++ b/src/backend/cdb/cdbtmpdir.c @@ -269,31 +269,19 @@ char* GetTmpDirPathFromArray(int64_t idx) Insist(idx >=0 && idx <= TmpDirNum-1); LWLockAcquire(TmpDirInfoLock, LW_SHARED); - for (int cnt = 0; cnt < TmpDirNum; cnt++, idx++) + + if (TmpDirInfoArray[idx].available) { - if (TmpDirInfoArray[idx].available) - { - LWLockRelease(TmpDirInfoLock); - return TmpDirInfoArray[idx].path; - } - else - { - if (idx == TmpDirNum-1) - { - /* start to look up the first element */ - idx = 0; - } - if (cnt == TmpDirNum-1) - { - /* all the temp dir are failed */ - ereport(FATAL, - (errcode(ERRCODE_CDB_INTERNAL_ERROR), - errmsg("Failed to find a valid temporary directory"))); - break; - } - } + LWLockRelease(TmpDirInfoLock); + return TmpDirInfoArray[idx].path; + } + else + { + LWLockRelease(TmpDirInfoLock); + ereport(FATAL, + (errcode(ERRCODE_CDB_INTERNAL_ERROR), + errmsg("Temporary directory:%s is failed", TmpDirInfoArray[idx].path))); } - LWLockRelease(TmpDirInfoLock); return NULL; } diff --git a/src/backend/cdb/cdbvars.c b/src/backend/cdb/cdbvars.c index 77380ef090..8f2d8c8a62 100644 --- a/src/backend/cdb/cdbvars.c +++ b/src/backend/cdb/cdbvars.c @@ -349,7 +349,6 @@ int rm_nocluster_timeout; /* How many seconds to wait before int rm_tolerate_nseg_limit; int rm_rejectrequest_nseg_limit; -int rm_segdown_tmpdir_limit; int rm_nvseg_variance_among_seg_limit; int rm_container_batch_limit; diff --git a/src/backend/resourcemanager/requesthandler.c b/src/backend/resourcemanager/requesthandler.c index fbabdcf0f8..6f5eba2c5c 100644 --- a/src/backend/resourcemanager/requesthandler.c +++ b/src/backend/resourcemanager/requesthandler.c @@ -755,28 +755,19 @@ bool handleRMSEGRequestIMAlive(void **arg) newsegstat->FailedTmpDirNum = header->TmpDirBrokenCount; /* - * Check if the number of failed temporary directory on this segment - * exceeds the value of rm_segdown_tmpdir_limit, if exceeds, master consider - * this segment as down, even it has heart-beat report. - * - * Notes: If the number of temporary directory is not greater than - * rm_segdown_tmpdir_limit, this guc value is treated as 0. - * which means if one temporary directory is failed, this segment - * is considered as down. + * Check if the there is any failed temporary directory on this segment. + * if has, master considers this segment as down, even it has heart-beat report. */ - uint32_t failedTmpDirLimit = header->TmpDirCount <= rm_segdown_tmpdir_limit ? - 0 : rm_segdown_tmpdir_limit; - if (newsegstat->FailedTmpDirNum <= failedTmpDirLimit) + if (newsegstat->FailedTmpDirNum == 0) { newsegstat->FTSAvailable = RESOURCE_SEG_STATUS_AVAILABLE; } else { - elog(RMLOG, "Resource manager finds the number of failed temporary directory:%d " - "exceeds the guc rm_segdown_tmpdir_limit:%d, " + elog(RMLOG, "Resource manager finds there is %d failed temporary directories " + "on this segment, " "so mark this segment unavailable.", - newsegstat->FailedTmpDirNum, - rm_segdown_tmpdir_limit); + newsegstat->FailedTmpDirNum); newsegstat->FTSAvailable = RESOURCE_SEG_STATUS_UNAVAILABLE; } diff --git a/src/backend/resourcemanager/resourcepool.c b/src/backend/resourcemanager/resourcepool.c index 2e4cf0c7bf..af58e41b78 100644 --- a/src/backend/resourcemanager/resourcepool.c +++ b/src/backend/resourcemanager/resourcepool.c @@ -957,7 +957,7 @@ int addHAWQSegWithSegStat(SegStat segstat, bool *capstatchanged) * if the length of new failed temporary directory exceeds the old one, * we need to repalloc SegInfoData */ - elog(RMLOG, "Master resource manager is gonna set segment %s(%d)'s " + elog(RMLOG, "Master resource manager is going to set segment %s(%d)'s " "failed temporary directory from '%s' to '%s'", GET_SEGRESOURCE_HOSTNAME(segresource), segid, diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index de42cdc96a..b75dcb07df 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -6433,16 +6433,6 @@ static struct config_int ConfigureNamesInt[] = 2, 0, 65535, NULL, NULL }, - { - {"hawq_rm_segdown_tmpdir_limit", PGC_POSTMASTER, RESOURCES_MGM, - gettext_noop("resource manager considers a segment as down if the number of " - "failed temporary directory on this segment is greater than this limit value."), - NULL - }, - &rm_segdown_tmpdir_limit, - 2, 0, 65535, NULL, NULL - }, - { {"hawq_rm_nvseg_variance_amon_seg_limit", PGC_POSTMASTER, RESOURCES_MGM, gettext_noop("the variance of vseg number in each segment that resource manager should tolerate at most."), diff --git a/src/include/cdb/cdbvars.h b/src/include/cdb/cdbvars.h index e383b23c0a..076374c998 100644 --- a/src/include/cdb/cdbvars.h +++ b/src/include/cdb/cdbvars.h @@ -1188,7 +1188,6 @@ extern int rm_session_lease_heartbeat_interval; extern int rm_nocluster_timeout; extern int rm_tolerate_nseg_limit; extern int rm_rejectrequest_nseg_limit; -extern int rm_segdown_tmpdir_limit; extern int rm_nvseg_variance_among_seg_limit; extern int rm_container_batch_limit; extern char *rm_resourcepool_test_filename; From 5639b006f489cb8d1c2217415f818da10d2bdfb0 Mon Sep 17 00:00:00 2001 From: Wen Lin Date: Mon, 18 Jan 2016 15:49:54 +0800 Subject: [PATCH 3/4] fix bug --- .../resourcemanager/include/resourcepool.h | 2 ++ src/backend/resourcemanager/resourcepool.c | 24 +++++++++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/src/backend/resourcemanager/include/resourcepool.h b/src/backend/resourcemanager/include/resourcepool.h index 12ee5ae59b..539735d6d6 100644 --- a/src/backend/resourcemanager/include/resourcepool.h +++ b/src/backend/resourcemanager/include/resourcepool.h @@ -633,6 +633,8 @@ int getOrderedResourceAllocTreeIndexByRatio(uint32_t ratio, BBST *tree); void setAllSegResourceGRMUnavailable(void); +int getAllSegResourceFTSAvailableNumber(void); + struct RB_GRMContainerStatData { int64_t ContainerID; diff --git a/src/backend/resourcemanager/resourcepool.c b/src/backend/resourcemanager/resourcepool.c index af58e41b78..83f10ef64d 100644 --- a/src/backend/resourcemanager/resourcepool.c +++ b/src/backend/resourcemanager/resourcepool.c @@ -1310,6 +1310,25 @@ void setAllSegResourceGRMUnavailable(void) freePAIRRefList(&(PRESPOOL->Segments), &allsegres); } +int getAllSegResourceFTSAvailableNumber(void) +{ + int cnt = 0; + List *allsegres = NULL; + ListCell *cell = NULL; + getAllPAIRRefIntoList(&(PRESPOOL->Segments), &allsegres); + + foreach(cell, allsegres) + { + SegResource segres = (SegResource)(((PAIR)lfirst(cell))->Value); + if (segres->Stat->FTSAvailable == RESOURCE_SEG_STATUS_AVAILABLE) + { + cnt++; + } + } + freePAIRRefList(&(PRESPOOL->Segments), &allsegres); + return cnt; +} + /* * Check index to get host id based on host name string. */ @@ -3889,12 +3908,13 @@ void validateResourcePoolStatus(bool refquemgr) Assert( availtree != NULL ); traverseBBSTMidOrder(availtree, &line); - if ( line.NodeCount != PRESPOOL->Segments.NodeCount ) + int availableCnt = getAllSegResourceFTSAvailableNumber(); + if ( line.NodeCount != availableCnt ) { elog(ERROR, "HAWQ RM Validation. The available resource ordered index " "contains %d nodes, expect %d nodes.", line.NodeCount, - PRESPOOL->Segments.NodeCount); + availableCnt); } SegResource prevres = NULL; From d454cb3045e309001f59a18497fd52bb147fe8cb Mon Sep 17 00:00:00 2001 From: Wen Lin Date: Mon, 18 Jan 2016 16:11:20 +0800 Subject: [PATCH 4/4] fix bug --- src/backend/resourcemanager/resourcepool.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/backend/resourcemanager/resourcepool.c b/src/backend/resourcemanager/resourcepool.c index 83f10ef64d..07bd6551ae 100644 --- a/src/backend/resourcemanager/resourcepool.c +++ b/src/backend/resourcemanager/resourcepool.c @@ -3963,12 +3963,13 @@ void validateResourcePoolStatus(bool refquemgr) Assert( alloctree != NULL ); traverseBBSTMidOrder(alloctree, &line); - if ( line.NodeCount != PRESPOOL->Segments.NodeCount ) + int availableCnt = getAllSegResourceFTSAvailableNumber(); + if ( line.NodeCount != availableCnt ) { elog(ERROR, "HAWQ RM Validation. The allocated resource ordered index " "contains %d nodes, expect %d nodes.", line.NodeCount, - PRESPOOL->Segments.NodeCount); + availableCnt); } SegResource prevres = NULL;