From 7a7fd3c58d52ecf7cfcd6fb172bece2f86802649 Mon Sep 17 00:00:00 2001 From: Mu2E Date: Thu, 8 Dec 2022 10:40:20 -0600 Subject: [PATCH] Fixed transition Configured -> Halted in the ots State Machine --- .../CorePropertySupervisorBase.cc | 11 ++ otsdaq/CoreSupervisors/FESupervisor.cc | 10 +- otsdaq/FECore/FEVInterface.cc | 2 + otsdaq/FECore/FEVInterfacesManager.cc | 20 ++-- .../RunControlStateMachine.cc | 13 ++- .../RunControlStateMachine.h | 3 + otsdaq/GatewaySupervisor/GatewaySupervisor.cc | 43 +++++-- otsdaq/Macros/StringMacros.cc | 2 +- otsdaq/SOAPUtilities/SOAPMessenger.cc | 15 ++- otsdaq/SupervisorInfo/AllSupervisorInfo.cc | 9 +- otsdaq/SupervisorInfo/AllSupervisorInfo.h | 2 +- .../MessageFacilityTable_table.cc | 2 +- tools/common.sh | 12 +- tools/ots | 105 ++++++++++++++---- tools/ots_remote_start | 2 +- 15 files changed, 196 insertions(+), 55 deletions(-) diff --git a/otsdaq/CoreSupervisors/CorePropertySupervisorBase.cc b/otsdaq/CoreSupervisors/CorePropertySupervisorBase.cc index dcd19f7b..707e32a0 100644 --- a/otsdaq/CoreSupervisors/CorePropertySupervisorBase.cc +++ b/otsdaq/CoreSupervisors/CorePropertySupervisorBase.cc @@ -107,6 +107,17 @@ CorePropertySupervisorBase::CorePropertySupervisorBase(xdaq::Application* applic __SUP_COUTV__(CorePropertySupervisorBase::supervisorApplicationUID_); __SUP_COUTV__(CorePropertySupervisorBase::supervisorConfigurationPath_); + //try to verify binding port for context was established + //All this code failed to do the trick + // { + // application->ptr_; + + // PeerTransportHTTP(this) + // const xdaq::NetGroup* netGroupPtr = application->getApplicationContext()->getNetGroup(); + // auto netVector = netGroupPtr->getNetworks(); + // __SUP_COUTV__(netVector.size()); + // } + CorePropertySupervisorBase::indicateOtsAlive(this); theConfigurationManager_->setOwnerContext(CorePropertySupervisorBase::supervisorContextUID_); diff --git a/otsdaq/CoreSupervisors/FESupervisor.cc b/otsdaq/CoreSupervisors/FESupervisor.cc index d7543257..bc26453b 100644 --- a/otsdaq/CoreSupervisors/FESupervisor.cc +++ b/otsdaq/CoreSupervisors/FESupervisor.cc @@ -958,18 +958,23 @@ void FESupervisor::transitionConfiguring(toolbox::Event::Reference /*event*/) void FESupervisor::transitionHalting(toolbox::Event::Reference event) { __SUP_COUT__ << "transitionHalting" << __E__; -TLOG_DEBUG(7) << "transitionHalting"; + TLOG_DEBUG(7) << "transitionHalting"; + + //shutdown workloops first, then shutdown metric manager + CoreSupervisorBase::transitionHalting(event); + try { if(metricMan && metricMan->Initialized()) { TLOG_DEBUG(7) << "Metric manager(" << metricMan << ") shutting down..." << __E__; metricMan->shutdown(); // will set initilized_ to false with mutex, which should prevent races - metricMan.reset(nullptr); TLOG_DEBUG(7) << "Metric manager shutdown." << __E__; } else __SUP_COUT__ << "Metric manager(" << metricMan << ") already shutdown." << __E__; + + metricMan.reset(nullptr); } catch(...) { @@ -984,6 +989,5 @@ TLOG_DEBUG(7) << "transitionHalting"; ); } - CoreSupervisorBase::transitionHalting(event); __SUP_COUT__ << "transitionHalting done." << __E__; } // end transitionHalting() diff --git a/otsdaq/FECore/FEVInterface.cc b/otsdaq/FECore/FEVInterface.cc index ae1289ca..5517e61a 100644 --- a/otsdaq/FECore/FEVInterface.cc +++ b/otsdaq/FECore/FEVInterface.cc @@ -42,6 +42,8 @@ FEVInterface::~FEVInterface(void) //============================================================================== void FEVInterface::configureSlowControls(void) { + __COUT__ << "configureSlowControls" << __E__; + // Start artdaq metric manager here, if possible if(metricMan && !metricMan->Running() && metricMan->Initialized()) { diff --git a/otsdaq/FECore/FEVInterfacesManager.cc b/otsdaq/FECore/FEVInterfacesManager.cc index 72edf439..f3e7336f 100644 --- a/otsdaq/FECore/FEVInterfacesManager.cc +++ b/otsdaq/FECore/FEVInterfacesManager.cc @@ -195,14 +195,18 @@ void FEVInterfacesManager::configure(void) fe->configure(); postStateMachineExecution(i); - // configure slow controls and start slow controls workloop - // slow controls workloop stays alive through start/stop.. and dies on halt - fe->configureSlowControls(); - fe->startSlowControlsWorkLoop(); - - __CFG_COUT__ << "Done " << transitionName << " interface " << name << __E__; - __CFG_COUT__ << "Done " << transitionName << " interface " << name << __E__; - __CFG_COUT__ << "Done " << transitionName << " interface " << name << __E__; + //when done with fe configure, configure slow controls + if(!fe->VStateMachine::getSubIterationWork() && !fe->VStateMachine::getIterationWork()) + { + // configure slow controls and start slow controls workloop + // slow controls workloop stays alive through start/stop.. and dies on halt + fe->configureSlowControls(); + fe->startSlowControlsWorkLoop(); + + __CFG_COUT__ << "Done " << transitionName << " interface " << name << __E__; + __CFG_COUT__ << "Done " << transitionName << " interface " << name << __E__; + __CFG_COUT__ << "Done " << transitionName << " interface " << name << __E__; + } } postStateMachineExecutionLoop(); diff --git a/otsdaq/FiniteStateMachine/RunControlStateMachine.cc b/otsdaq/FiniteStateMachine/RunControlStateMachine.cc index f1eb6043..298b5d45 100644 --- a/otsdaq/FiniteStateMachine/RunControlStateMachine.cc +++ b/otsdaq/FiniteStateMachine/RunControlStateMachine.cc @@ -24,6 +24,9 @@ const std::string RunControlStateMachine::HALTED_STATE_NAME = "Halted"; const std::string RunControlStateMachine::PAUSED_STATE_NAME = "Paused"; const std::string RunControlStateMachine::RUNNING_STATE_NAME = "Running"; +const std::string RunControlStateMachine::SHUTDOWN_TRANSITION_NAME = "Shutdown"; +const std::string RunControlStateMachine::STARTUP_TRANSITION_NAME = "Startup"; + //============================================================================== RunControlStateMachine::RunControlStateMachine(const std::string& name) : theStateMachine_(name), asyncFailureReceived_(false), asyncPauseExceptionReceived_(false), asyncStopExceptionReceived_(false) @@ -51,14 +54,14 @@ RunControlStateMachine::RunControlStateMachine(const std::string& name) //clang-format off // this line was added to get out of Failed state RunControlStateMachine::addStateTransition('F', 'H', "Halt", "Halting", this, &RunControlStateMachine::transitionHalting); - RunControlStateMachine::addStateTransition('F', 'X', "Shutdown", "Shutting Down", this, &RunControlStateMachine::transitionShuttingDown); + RunControlStateMachine::addStateTransition('F', 'X', RunControlStateMachine::SHUTDOWN_TRANSITION_NAME, "Shutting Down", this, &RunControlStateMachine::transitionShuttingDown); RunControlStateMachine::addStateTransition('F', 'F', "Error", "Erroring", this, &RunControlStateMachine::transitionShuttingDown); RunControlStateMachine::addStateTransition('F', 'F', "Fail", "Failing", this, &RunControlStateMachine::transitionShuttingDown); RunControlStateMachine::addStateTransition( 'H', 'C', "Configure", "Configuring", "ConfigurationAlias", this, &RunControlStateMachine::transitionConfiguring); - RunControlStateMachine::addStateTransition('H', 'X', "Shutdown", "Shutting Down", this, &RunControlStateMachine::transitionShuttingDown); - RunControlStateMachine::addStateTransition('X', 'I', "Startup", "Starting Up", this, &RunControlStateMachine::transitionStartingUp); + RunControlStateMachine::addStateTransition('H', 'X', RunControlStateMachine::SHUTDOWN_TRANSITION_NAME, "Shutting Down", this, &RunControlStateMachine::transitionShuttingDown); + RunControlStateMachine::addStateTransition('X', 'I', RunControlStateMachine::STARTUP_TRANSITION_NAME, "Starting Up", this, &RunControlStateMachine::transitionStartingUp); // Every state can transition to halted RunControlStateMachine::addStateTransition('I', 'H', "Initialize", "Initializing", this, &RunControlStateMachine::transitionInitializing); @@ -84,8 +87,8 @@ RunControlStateMachine::RunControlStateMachine(const std::string& name) xoap::bind(this, &RunControlStateMachine::runControlMessageHandler, "Resume", XDAQ_NS_URI); xoap::bind(this, &RunControlStateMachine::runControlMessageHandler, "Halt", XDAQ_NS_URI); xoap::bind(this, &RunControlStateMachine::runControlMessageHandler, "Abort", XDAQ_NS_URI); - xoap::bind(this, &RunControlStateMachine::runControlMessageHandler, "Shutdown", XDAQ_NS_URI); - xoap::bind(this, &RunControlStateMachine::runControlMessageHandler, "Startup", XDAQ_NS_URI); + xoap::bind(this, &RunControlStateMachine::runControlMessageHandler, RunControlStateMachine::SHUTDOWN_TRANSITION_NAME, XDAQ_NS_URI); + xoap::bind(this, &RunControlStateMachine::runControlMessageHandler, RunControlStateMachine::STARTUP_TRANSITION_NAME, XDAQ_NS_URI); xoap::bind(this, &RunControlStateMachine::runControlMessageHandler, "Fail", XDAQ_NS_URI); xoap::bind(this, &RunControlStateMachine::runControlMessageHandler, "Error", XDAQ_NS_URI); diff --git a/otsdaq/FiniteStateMachine/RunControlStateMachine.h b/otsdaq/FiniteStateMachine/RunControlStateMachine.h index 9c98e535..96c99fb3 100644 --- a/otsdaq/FiniteStateMachine/RunControlStateMachine.h +++ b/otsdaq/FiniteStateMachine/RunControlStateMachine.h @@ -117,6 +117,9 @@ class RunControlStateMachine : public virtual toolbox::lang::Class static const std::string PAUSED_STATE_NAME; static const std::string RUNNING_STATE_NAME; + static const std::string SHUTDOWN_TRANSITION_NAME; + static const std::string STARTUP_TRANSITION_NAME; + unsigned int getIterationIndex(void) { return iterationIndex_; } unsigned int getSubIterationIndex(void) { return subIterationIndex_; } void indicateIterationWork(void) { iterationWorkFlag_ = true; } diff --git a/otsdaq/GatewaySupervisor/GatewaySupervisor.cc b/otsdaq/GatewaySupervisor/GatewaySupervisor.cc index b2e098a9..8a8c6326 100644 --- a/otsdaq/GatewaySupervisor/GatewaySupervisor.cc +++ b/otsdaq/GatewaySupervisor/GatewaySupervisor.cc @@ -198,6 +198,7 @@ void GatewaySupervisor::AppStatusWorkLoop(GatewaySupervisor* theSupervisor) // sleep // __COUT__ << "Just debugging App status checking" << __E__; + bool oneStatusReqHasFailed = false; for(const auto& it : theSupervisor->allSupervisorInfo_.getAllSupervisorInfo()) { auto appInfo = it.second; @@ -291,19 +292,33 @@ void GatewaySupervisor::AppStatusWorkLoop(GatewaySupervisor* theSupervisor) } catch(const xdaq::exception::Exception& e) { - //__COUT__ << "Failed to send getStatus SOAP Message: " << e.what() << __E__; + __COUT__ << "Getting Status " + << " Supervisor instance = '" << appInfo.getName() + << "' [LID=" << appInfo.getId() << "] in Context '" + << appInfo.getContextName() << "' [URL=" << + appInfo.getURL() + << "].\n\n"; + __COUTV__(SOAPUtilities::translate(tempMessage)); + __COUT_WARN__ << "Failed to send getStatus SOAP Message: " << e.what() << __E__; status = SupervisorInfo::APP_STATUS_UNKNOWN; progress = "0"; detail = "SOAP Message Error"; - sleep(5); // sleep to not overwhelm server with errors + oneStatusReqHasFailed = true; } catch(...) { - //__COUT_WARN__ << "Failed to send getStatus SOAP Message due to unknown error." << __E__; + __COUT__ << "Getting Status " + << " Supervisor instance = '" << appInfo.getName() + << "' [LID=" << appInfo.getId() << "] in Context '" + << appInfo.getContextName() << "' [URL=" << + appInfo.getURL() + << "].\n\n"; + __COUTV__(SOAPUtilities::translate(tempMessage)); + __COUT_WARN__ << "Failed to send getStatus SOAP Message due to unknown error." << __E__; status = SupervisorInfo::APP_STATUS_UNKNOWN; progress = "0"; detail = "Unknown SOAP Message Error"; - sleep(5); // sleep to not overwhelm server with errors + oneStatusReqHasFailed = true; } } // end with non-gateway status request handling @@ -319,6 +334,8 @@ void GatewaySupervisor::AppStatusWorkLoop(GatewaySupervisor* theSupervisor) theSupervisor->allSupervisorInfo_.setSupervisorStatus(appInfo, status, progressInteger, detail); } // end of app loop + if(oneStatusReqHasFailed) + sleep(5); // sleep to not overwhelm server with errors } // end of infinite status checking loop } // end AppStatusWorkLoop @@ -1734,7 +1751,8 @@ catch(...) void GatewaySupervisor::transitionShuttingDown(toolbox::Event::Reference /*e*/) try { - __COUT__ << "Fsm current state: " << theStateMachine_.getCurrentStateName() << __E__; + __COUT__ << "Fsm current state: " << theStateMachine_.getCurrentStateName() << + " message: " << theStateMachine_.getCurrentStateName() << __E__; RunControlStateMachine::theProgressBar_.step(); makeSystemLogEntry("System shutting down."); @@ -1751,6 +1769,9 @@ try sleep(1); RunControlStateMachine::theProgressBar_.step(); } + + broadcastMessage(theStateMachine_.getCurrentMessage()); + } // end transitionShuttingDown() catch(const xdaq::exception::Exception& e) // due to xoap send failure { @@ -1803,6 +1824,8 @@ try RunControlStateMachine::theProgressBar_.step(); } + broadcastMessage(theStateMachine_.getCurrentMessage()); + } // end transitionStartingUp() catch(const xdaq::exception::Exception& e) // due to xoap send failure { @@ -2257,6 +2280,8 @@ bool GatewaySupervisor::handleBroadcastMessageTarget(const SupervisorInfo& appI RunControlStateMachine::theProgressBar_.step(); std::string givenAppStatus = theStateMachine_.getCurrentTransitionName(command); + __COUTV__(givenAppStatus.capacity()); + unsigned int givenAppProgress = appInfo.getProgress(); std::string givenAppDetail = appInfo.getDetail(); if(givenAppProgress >= 100) @@ -2317,6 +2342,7 @@ bool GatewaySupervisor::handleBroadcastMessageTarget(const SupervisorInfo& appI // for transition attempt, set status for app, in case the request occupies the target app reply = send(appInfo.getDescriptor(), message); // then release mutex here using scope change, to allow the app to start giving its own updates + __COUTV__(givenAppStatus.capacity()); } catch(const xdaq::exception::Exception& e) // due to xoap send failure { @@ -2553,7 +2579,10 @@ void GatewaySupervisor::broadcastMessage(xoap::MessageReference message) try { - orderedSupervisors = allSupervisorInfo_.getOrderedSupervisorDescriptors(command); + orderedSupervisors = allSupervisorInfo_.getOrderedSupervisorDescriptors(command, + //only gateway apps for special shutdown and startup command broadcast + command == RunControlStateMachine::SHUTDOWN_TRANSITION_NAME || + command == RunControlStateMachine::STARTUP_TRANSITION_NAME); } catch(const std::runtime_error& e) { @@ -3919,7 +3948,7 @@ void GatewaySupervisor::launchStartOTSCommand(const std::string& command, Config if(context.address_[i] == '/') j = i + 1; hostnames.push_back(context.address_.substr(j)); - __COUT__ << "StartOTS.sh hostname = " << hostnames.back() << __E__; + __COUT__ << "StartOTS.sh command '" << command << "' launching on hostname = " << hostnames.back() << __E__; } } catch(...) diff --git a/otsdaq/Macros/StringMacros.cc b/otsdaq/Macros/StringMacros.cc index 0ac13e5c..8ee7ba3d 100644 --- a/otsdaq/Macros/StringMacros.cc +++ b/otsdaq/Macros/StringMacros.cc @@ -1271,7 +1271,7 @@ char* StringMacros::otsGetEnvironmentVarable(const char* name, const std::string { __SS__ << "Environment variable '" << name << "' not defined at " << location << "[" << line << "]" << __E__; ss << "\n\n" << StringMacros::stackTrace() << __E__; - __SS_THROW__; + __SS_ONLY_THROW__; } return environmentVariablePtr; } // end otsGetEnvironmentVarable() diff --git a/otsdaq/SOAPUtilities/SOAPMessenger.cc b/otsdaq/SOAPUtilities/SOAPMessenger.cc index 5157b81c..6e4fdfed 100644 --- a/otsdaq/SOAPUtilities/SOAPMessenger.cc +++ b/otsdaq/SOAPUtilities/SOAPMessenger.cc @@ -51,7 +51,20 @@ std::string SOAPMessenger::send(XDAQ_CONST_CALL xdaq::ApplicationDescriptor* d, { xoap::MessageReference message = SOAPUtilities::makeSOAPMessageReference(command); - return send(d, message); + std::string msgStr; + try + { + msgStr = send(d, message); + } + catch(...) + { + __COUT__ << "send failed?!" << __E__; + __COUT__ << SOAPUtilities::translate(message) << __E__; + throw; + } + + + return msgStr; } //============================================================================== diff --git a/otsdaq/SupervisorInfo/AllSupervisorInfo.cc b/otsdaq/SupervisorInfo/AllSupervisorInfo.cc index b88f841c..1ae80961 100644 --- a/otsdaq/SupervisorInfo/AllSupervisorInfo.cc +++ b/otsdaq/SupervisorInfo/AllSupervisorInfo.cc @@ -352,9 +352,9 @@ const SupervisorInfo& AllSupervisorInfo::getArtdaqSupervisorInfo(void) const } // end getArtdaqSupervisorInfo() //============================================================================== -std::vector> AllSupervisorInfo::getOrderedSupervisorDescriptors(const std::string& stateMachineCommand) const +std::vector> AllSupervisorInfo::getOrderedSupervisorDescriptors(const std::string& stateMachineCommand, bool onlyGatewayContextSupervisors) const { - __COUT__ << "getOrderedSupervisorDescriptors" << __E__; + __COUT__ << "getOrderedSupervisorDescriptors for command " << stateMachineCommand << __E__; std::map> orderedByPriority; @@ -417,6 +417,11 @@ std::vector> AllSupervisorInfo::getOrderedSup // priority? " << (unsigned int)priorityAppVector.first << //__E__; + + if(onlyGatewayContextSupervisors && + it->second.getContextName() != theSupervisorInfo_->getContextName()) + continue; //for shutdown and startup only broadcast to apps that are local to the Gateway supervisor + if(it->second.isGatewaySupervisor()) continue; // skip gateway supervisor if(it->second.isTypeLogbookSupervisor()) diff --git a/otsdaq/SupervisorInfo/AllSupervisorInfo.h b/otsdaq/SupervisorInfo/AllSupervisorInfo.h index e88141a4..d6c61bff 100644 --- a/otsdaq/SupervisorInfo/AllSupervisorInfo.h +++ b/otsdaq/SupervisorInfo/AllSupervisorInfo.h @@ -61,7 +61,7 @@ class AllSupervisorInfo : public SupervisorDescriptorInfoBase XDAQ_CONST_CALL xdaq::ApplicationDescriptor* getWizardDescriptor (void) const; const SupervisorInfo& getArtdaqSupervisorInfo (void) const; - std::vector> getOrderedSupervisorDescriptors (const std::string& stateMachineCommand) const; + std::vector> getOrderedSupervisorDescriptors (const std::string& stateMachineCommand, bool onlyGatewayContextSupervisors = false) const; std::recursive_mutex& getSupervisorInfoMutex (unsigned int lid) { return allSupervisorInfoMutex_[lid]; } private: SupervisorInfo* theSupervisorInfo_; diff --git a/otsdaq/TablePlugins/MessageFacilityTable_table.cc b/otsdaq/TablePlugins/MessageFacilityTable_table.cc index acf8a5e5..1d3aa840 100644 --- a/otsdaq/TablePlugins/MessageFacilityTable_table.cc +++ b/otsdaq/TablePlugins/MessageFacilityTable_table.cc @@ -117,7 +117,7 @@ void MessageFacilityTable::init(ConfigurationManager* configManager) child.second.getNode(COL_QT_PORT).getValue(fwdPort); } - __COUT__ << "Foud FWD/WEB/QT " << (COL_ENABLE_FWD ? "true" : "false") << "/" << (COL_USE_WEB ? "true" : "false") << "/" + __COUT__ << "Found FWD/WEB/QT " << (COL_ENABLE_FWD ? "true" : "false") << "/" << (COL_USE_WEB ? "true" : "false") << "/" << (COL_USE_QT ? "true" : "false") << " and IP:Port:FwdPort " << fwdIP << ":" << fwdPort << ":" << destFwdPort << " in MesageFacility table." << __E__; break; // take first enable row only! diff --git a/tools/common.sh b/tools/common.sh index fc1a9911..71bad087 100644 --- a/tools/common.sh +++ b/tools/common.sh @@ -89,10 +89,10 @@ defineColors () defineColors SCRIPT_NAME=$1 -out() { echo -e "${RstClr}${IRed}${STARTTIME}${RstClr}-${Green}$(date +'%h%y.%T') ${IBlue}${HOSTNAME}${RstClr} ${SCRIPT_NAME} [${Cyan}${BASH_LINENO[0]}${RstClr}]${IBlack}\t${RstClr}$@${RstClr}"; } -info() { echo -e "${RstClr}${IRed}${STARTTIME}${RstClr}-${Green}$(date +'%h%y.%T') ${IBlue}${HOSTNAME}${RstClr} ${SCRIPT_NAME} [${Cyan}${BASH_LINENO[0]}${RstClr}]${IBlack}\t${RstClr}${IBlue}$@${RstClr}"; } -success() { echo -e "${RstClr}${IRed}${STARTTIME}${RstClr}-${Green}$(date +'%h%y.%T') ${IBlue}${HOSTNAME}${RstClr} ${SCRIPT_NAME} [${Cyan}${BASH_LINENO[0]}${RstClr}]${IBlack}\t${RstClr}${IGreen}$@${RstClr}"; } -error() { echo -e "${RstClr}${IRed}${STARTTIME}${RstClr}-${Green}$(date +'%h%y.%T') ${IBlue}${HOSTNAME}${RstClr} ${SCRIPT_NAME} [${Cyan}${BASH_LINENO[0]}${RstClr}]${IBlack}\t${RstClr}${IRed}$@${RstClr}"; } >&2 -warning() { echo -e "${RstClr}${IRed}${STARTTIME}${RstClr}-${Green}$(date +'%h%y.%T') ${IBlue}${HOSTNAME}${RstClr} ${SCRIPT_NAME} [${Cyan}${BASH_LINENO[0]}${RstClr}]${IBlack}\t${RstClr}${IYellow}$@${RstClr}"; } >&2 -die() { echo -e "${RstClr}${IRed}${STARTTIME}${RstClr}-${Green}$(date +'%h%y.%T') ${IBlue}${HOSTNAME}${RstClr} ${SCRIPT_NAME} [${Cyan}${BASH_LINENO[0]}${RstClr}]${IBlack}\t${RstClr}${IRed}$@${RstClr}"; exit 1; } +out() { echo -e "${RstClr}${IRed}${STARTTIME}${RstClr}-${Green}$(date +'%d%h%y.%T') ${IBlue}${HOSTNAME}${RstClr} ${SCRIPT_NAME} [${Cyan}${BASH_LINENO[0]}${RstClr}]${IBlack}\t${RstClr}$@${RstClr}"; } +info() { echo -e "${RstClr}${IRed}${STARTTIME}${RstClr}-${Green}$(date +'%d%h%y.%T') ${IBlue}${HOSTNAME}${RstClr} ${SCRIPT_NAME} [${Cyan}${BASH_LINENO[0]}${RstClr}]${IBlack}\t${RstClr}${IBlue}$@${RstClr}"; } +success() { echo -e "${RstClr}${IRed}${STARTTIME}${RstClr}-${Green}$(date +'%d%h%y.%T') ${IBlue}${HOSTNAME}${RstClr} ${SCRIPT_NAME} [${Cyan}${BASH_LINENO[0]}${RstClr}]${IBlack}\t${RstClr}${IGreen}$@${RstClr}"; } +error() { echo -e "${RstClr}${IRed}${STARTTIME}${RstClr}-${Green}$(date +'%d%h%y.%T') ${IBlue}${HOSTNAME}${RstClr} ${SCRIPT_NAME} [${Cyan}${BASH_LINENO[0]}${RstClr}]${IBlack}\t${RstClr}${IRed}$@${RstClr}"; } >&2 +warning() { echo -e "${RstClr}${IRed}${STARTTIME}${RstClr}-${Green}$(date +'%d%h%y.%T') ${IBlue}${HOSTNAME}${RstClr} ${SCRIPT_NAME} [${Cyan}${BASH_LINENO[0]}${RstClr}]${IBlack}\t${RstClr}${IYellow}$@${RstClr}"; } >&2 +die() { echo -e "${RstClr}${IRed}${STARTTIME}${RstClr}-${Green}$(date +'%d%h%y.%T') ${IBlue}${HOSTNAME}${RstClr} ${SCRIPT_NAME} [${Cyan}${BASH_LINENO[0]}${RstClr}]${IBlack}\t${RstClr}${IRed}$@${RstClr}"; exit 1; } diff --git a/tools/ots b/tools/ots index 9d4a21e6..daac4661 100644 --- a/tools/ots +++ b/tools/ots @@ -53,7 +53,7 @@ HOSTNAME="$(hostname -f)" THIS_HOST="${HOSTNAME}" HOSTNAME_ARR=($(echo "${THIS_HOST}" | tr '.' " ")) -STARTTIME=`date +"%h%y.%T"` #to fully ID one StartOTS from another +STARTTIME=`date +"%d%h%y.%T"` #to fully ID one StartOTS from another #export products, used by artdaq daqInterface export OTS_PRODUCTS=`echo $PRODUCTS|tr ':' '\n'|grep -v localProducts|head -1` @@ -348,8 +348,6 @@ export -f killproc ############################# # function to kill all things ots -# TODO (Iris): Using ps and grep isn't a safe way to try to find processes, we should handle shutdown and cleanup -# in some kind of management process/script that cleans up properly function killprocs { @@ -379,6 +377,8 @@ function killprocs for key in `ipcs|grep $usershort|grep ' 0 '|awk '{print $1}'`;do ipcrm -M $key;done sleep 1 #give time for cleanup to occur + + else #then killing only non-gateway contexts for contextPID in "${ContextPIDArray[@]}" @@ -386,7 +386,10 @@ function killprocs if [ $ISREMOTE == 0 ]; then out "${Red}${Rev}Killing Non-gateway process ID ${contextPID}${RstClr}" + else + out "${Red}${Rev}Killing Non-gateway process ID ${contextPID}${RstClr}" fi + killproc ${contextPID} & @@ -397,8 +400,7 @@ function killprocs sleep 1 #give time for cleanup to occur fi - #now hard kill any processes that may be stuck and detached: - + #now hard kill any processes that may be stuck and detached: killall -9 art &>/dev/null 2>&1 #hide output killall -9 boardreader &>/dev/null 2>&1 #hide output killall -9 eventbuilder &>/dev/null 2>&1 #hide output @@ -406,9 +408,12 @@ function killprocs killall -9 dispatcher &>/dev/null 2>&1 #hide output killall -9 routing_manager &>/dev/null 2>&1 #hide output ipcrm -a &>/dev/null 2>&1 #hide output #clean-up shared memory - killall -9 xdaq.exe &>/dev/null 2>&1 #hide output - killall -9 otsConsoleFwd &>/dev/null 2>&1 #hide output - + + if [[ "x$1" == "x" ]]; then #kill all ots processes + killall -9 xdaq.exe &>/dev/null 2>&1 #hide output + killall -9 otsConsoleFwd &>/dev/null 2>&1 #hide output + fi + if [ $ISREMOTE == 0 ]; then out "Done killing processes $1 ${RstClr}" fi @@ -1416,18 +1421,31 @@ launchOTS() { #test for relaunch # 4 times quietly, then get louder + sleep 2 OTS_IS_ALIVE="$(cat ${OTSDAQ_LOG_DIR}/otsdaq_is_alive-${THIS_HOST}-${port}.dat 2>/dev/null)" #hide error messages, but not std out (std out is needed!) #out "OTS_IS_ALIVE=${OTS_IS_ALIVE}" - - # Test for start 4 times with short sleep - for t in {1..4}; do + # sleep 10 + + # Test for start 8 times with short sleep + for t in {1..8}; do if [ "x$OTS_IS_ALIVE" == "x" ]; then - sleep 1 + sleep 2 OTS_IS_ALIVE="$(cat ${OTSDAQ_LOG_DIR}/otsdaq_is_alive-${THIS_HOST}-${port}.dat 2>/dev/null)" #hide error messages, but not std out (std out is needed!) + out "Checking http://${THIS_HOST}:${port}..." else break fi done + + + # have seen instances where http peer transport xdaq port is not established, but xdaq context still is constructed + # so clear alive, if ots script can not connect to port + OTS_CONTEXT_UP="$(curl http://${THIS_HOST}:${port} >/dev/null 2>&1 || echo \"bad\")" #detect if http server has gone up + # out "OTS_CONTEXT_UP=${OTS_CONTEXT_UP}" + if [ "$OTS_CONTEXT_UP" == "bad" ]; then + out "Context http port could not be verified at curl http://${THIS_HOST}:${port} >/dev/null 2>&1" + OTS_IS_ALIVE= + fi # Alert the user to the problem and keep relaunching @@ -1449,8 +1467,26 @@ launchOTS() { LAST_OTS_PID=$! #verify relaunch + sleep 2 + OTS_IS_ALIVE="$(cat ${OTSDAQ_LOG_DIR}/otsdaq_is_alive-${THIS_HOST}-${port}.dat 2>/dev/null)" + # Test for start 16 times with short sleep + for t in {1..16}; do + if [ "x$OTS_IS_ALIVE" == "x" ]; then + sleep 2 + OTS_IS_ALIVE="$(cat ${OTSDAQ_LOG_DIR}/otsdaq_is_alive-${THIS_HOST}-${port}.dat 2>/dev/null)" #hide error messages, but not std out (std out is needed!) + out "Checking http://${THIS_HOST}:${port}..." + else + break + fi + done + sleep 8 #give time before relaunch test - OTS_IS_ALIVE="$(cat ${OTSDAQ_LOG_DIR}/otsdaq_is_alive-${THIS_HOST}-${port}.dat)" + OTS_CONTEXT_UP="$(curl http://${THIS_HOST}:${port} >/dev/null 2>&1 || echo \"bad\")" #detect if http server has gone up + out "OTS_CONTEXT_UP=${OTS_CONTEXT_UP}" + if [ "$OTS_CONTEXT_UP" == "bad" ]; then + out "Context http port could not be verified at curl http://${THIS_HOST}:${port} >/dev/null 2>&1" + OTS_IS_ALIVE= + fi #check if exit requested to avoid endless looping (e.g. on remote hosts) OTSDAQ_STARTOTS_ACTION="$(cat ${OTSDAQ_STARTOTS_ACTION_FILE})" @@ -1592,17 +1628,28 @@ launchOTS() { #test for relaunch # 4 times quietly, then get louder + sleep 2 OTS_IS_ALIVE="$(cat ${OTSDAQ_LOG_DIR}/otsdaq_is_alive-${THIS_HOST}-${port}.dat 2>/dev/null)" #hide error messages, but not std out (std out is needed!) #out "OTS_IS_ALIVE=${OTS_IS_ALIVE}" - for t in {1..4}; do + for t in {1..8}; do if [ "x$OTS_IS_ALIVE" == "x" ]; then - sleep 1 - OTS_IS_ALIVE="$(cat ${OTSDAQ_LOG_DIR}/otsdaq_is_alive-${THIS_HOST}-${port}.dat 2>/dev/null)" #hide error messages, but not std out (std out is needed!) + sleep 2 + OTS_IS_ALIVE="$(cat ${OTSDAQ_LOG_DIR}/otsdaq_is_alive-${THIS_HOST}-${port}.dat 2>/dev/null)" #hide error messages, but not std out (std out is needed!) + out "Checking http://${THIS_HOST}:${port}..." else break fi done + + # have seen instances where http peer transport xdaq port is not established, but xdaq context still is constructed + # so clear alive, if ots script can not connect to port + OTS_CONTEXT_UP="$(curl http://${THIS_HOST}:${port} >/dev/null 2>&1 || echo \"bad\")" #detect if http server has gone up + # out "OTS_CONTEXT_UP=${OTS_CONTEXT_UP}" + if [ "$OTS_CONTEXT_UP" == "bad" ]; then + out "Context http port could not be verified at curl http://${THIS_HOST}:${port} >/dev/null 2>&1" + OTS_IS_ALIVE= + fi while [ "x$OTS_IS_ALIVE" == "x" ]; do warning "Could not verify successful ots non-gateway context launch, relaunching non-gateway context..." @@ -1624,8 +1671,24 @@ launchOTS() { fi #test for relaunch - sleep 8 #give time before relaunch test - OTS_IS_ALIVE="$(cat ${OTSDAQ_LOG_DIR}/otsdaq_is_alive-${THIS_HOST}-${port}.dat)" + sleep 2 + OTS_IS_ALIVE="$(cat ${OTSDAQ_LOG_DIR}/otsdaq_is_alive-${THIS_HOST}-${port}.dat 2>/dev/null)" + for t in {1..16}; do + if [ "x$OTS_IS_ALIVE" == "x" ]; then + sleep 2 + OTS_IS_ALIVE="$(cat ${OTSDAQ_LOG_DIR}/otsdaq_is_alive-${THIS_HOST}-${port}.dat 2>/dev/null)" #hide error messages, but not std out (std out is needed!) + out "Checking http://${THIS_HOST}:${port}..." + else + break + fi + done + + OTS_CONTEXT_UP="$(curl http://${THIS_HOST}:${port} >/dev/null 2>&1 || echo \"bad\")" #detect if http server has gone up + # out "OTS_CONTEXT_UP=${OTS_CONTEXT_UP}" + if [ "$OTS_CONTEXT_UP" == "bad" ]; then + out "Context http port could not be verified at curl http://${THIS_HOST}:${port} >/dev/null 2>&1" + OTS_IS_ALIVE= + fi #check if exit requested to avoid endless looping (e.g. on remote hosts) OTSDAQ_STARTOTS_ACTION="$(cat ${OTSDAQ_STARTOTS_ACTION_FILE})" @@ -2119,7 +2182,11 @@ otsActionHandler() { out "Starting up non-gateway contexts..." out " " - launchOTS nongateway #launch all non-gateway apps + #prevent remote ssh launches + TMP_REMOTELAUNCH=$REMOTELAUNCH + REMOTELAUNCH=0 + launchOTS nongateway #launch all non-gateway apps + REMOTELAUNCH=$TMP_REMOTELAUNCH elif [ "$OTSDAQ_STARTOTS_ACTION" == "LAUNCH_WIZ" ]; then diff --git a/tools/ots_remote_start b/tools/ots_remote_start index 127daf47..542ee546 100644 --- a/tools/ots_remote_start +++ b/tools/ots_remote_start @@ -8,7 +8,7 @@ SCRIPT_DIR="$( # Get color code variables source ${SCRIPT_DIR}/common.sh ots_remote_start -STARTTIME=`date +"%h%y.%T"` #to fully ID one StartOTS from another +STARTTIME=`date +"%d%h%y.%T"` #to fully ID one StartOTS from another out "${Green}=========================== ots REMOTE =============================" info "ots script path = ${SCRIPT_DIR}/ots_remote_start" info "Expected setup path = ${SCRIPT_DIR}/../../../../setup_ots.sh"