Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improved FSM asynchronous error handling; improved Gateway app status… #84

Merged
merged 1 commit into from
Jun 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions otsdaq/ARTDAQSupervisor/ARTDAQSupervisor.hh
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ class ARTDAQSupervisor : public CoreSupervisorBase
// Returns the current thread progress message as a detail string.
// thread_mutex_ is held for the duration of the call, so the returned
// value is produced while the lock is still owned.
virtual std::string getStatusProgressDetail(void) override
{
	const std::lock_guard<std::mutex> guard(thread_mutex_);
	// debug aid, normally disabled: __COUTV__(thread_progress_message_);
	return thread_progress_message_;
}

Expand Down
14 changes: 11 additions & 3 deletions otsdaq/FiniteStateMachine/FiniteStateMachine.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ using namespace ots;
#define __MF_SUBJECT__ "FSM"
#define mfSubject_ std::string("FSM-") + getStateMachineName()

const std::string FiniteStateMachine::FAILED_STATE_NAME = "Failed";

//==============================================================================
FiniteStateMachine::FiniteStateMachine(const std::string& stateMachineName)
: stateEntranceTime_(0), inTransition_(false), provenanceState_('X'), theErrorMessage_(""), stateMachineName_(stateMachineName)
Expand Down Expand Up @@ -143,16 +145,21 @@ bool FiniteStateMachine::execTransition(const std::string& transition, const xoa

if(transition == "fail")
{
__GEN_COUT_INFO__ << "Failing now!!" << __E__;

while(inTransition_)
{
__GEN_COUT__ << "Currently in a transition executed from current state " << getProvenanceStateName()
__GEN_COUT__ << "Currently in transition '" << currentTransition_ << "' executed from current state " << getProvenanceStateName()
<< ". Attempting to wait for the transition to complete." << __E__;
sleep(1);
}
sleep(1);

if(getStateName(getCurrentState()) == FiniteStateMachine::FAILED_STATE_NAME)
{
__GEN_COUT_INFO__ << "Already failed. Current state: " << getStateName(getCurrentState()) << " last state: " << getProvenanceStateName() << __E__;
return true;
}
__GEN_COUT_INFO__ << "Failing now!! Current state: " << getStateName(getCurrentState()) << " last state: " << getProvenanceStateName() << __E__;

// find any valid transition and take it..
// all transition functions must check for a failure
// flag, and throw an exception to go to Fail state
Expand All @@ -161,6 +168,7 @@ bool FiniteStateMachine::execTransition(const std::string& transition, const xoa
for(const auto& transitionPair : transitions)
{
__GEN_COUT__ << "Taking transition to indirect failure: " << transitionPair.first << __E__;
__GEN_COUT__ << "Taking fail transition from Current state: " << getStateName(getCurrentState()) << " last state: " << getProvenanceStateName() << __E__;
toolbox::Event::Reference event(new toolbox::Event(transitionPair.first, this));

try
Expand Down
2 changes: 2 additions & 0 deletions otsdaq/FiniteStateMachine/FiniteStateMachine.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ class FiniteStateMachine : public toolbox::fsm::FiniteStateMachine
void setInitialState (toolbox::fsm::State state);
void setErrorMessage (const std::string& errMessage, bool append = true);

static const std::string FAILED_STATE_NAME;
protected:
time_t stateEntranceTime_;

Expand All @@ -89,6 +90,7 @@ class FiniteStateMachine : public toolbox::fsm::FiniteStateMachine
xoap::MessageReference theMessage_;
std::string theErrorMessage_;
std::string stateMachineName_;


private:
};
Expand Down
12 changes: 11 additions & 1 deletion otsdaq/FiniteStateMachine/RunControlStateMachine.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

using namespace ots;

const std::string RunControlStateMachine::FAILED_STATE_NAME = "Failed";
const std::string RunControlStateMachine::FAILED_STATE_NAME = FiniteStateMachine::FAILED_STATE_NAME;
const std::string RunControlStateMachine::HALTED_STATE_NAME = "Halted";
const std::string RunControlStateMachine::PAUSED_STATE_NAME = "Paused";
const std::string RunControlStateMachine::RUNNING_STATE_NAME = "Running";
Expand Down Expand Up @@ -315,8 +315,18 @@ xoap::MessageReference RunControlStateMachine::runControlMessageHandler(xoap::Me
return SOAPUtilities::makeSOAPMessageReference(result);
}

if(command == "Halt" && currentState == RunControlStateMachine::FAILED_STATE_NAME)
{
__GEN_COUT__ << "Clearing Errors after failure..." << std::endl;
theStateMachine_.setErrorMessage("", false /*append*/); // clear error message
asyncFailureReceived_ = false;
}

__GEN_COUTV__(command);
__GEN_COUTV__(currentState);
__GEN_COUTV__(asyncFailureReceived_);
__GEN_COUTV__(asyncPauseExceptionReceived_);
__GEN_COUTV__(asyncStopExceptionReceived_);
__GEN_COUTV__(getErrorMessage());
__GEN_COUTV__(retransmittedCommand);

Expand Down
115 changes: 90 additions & 25 deletions otsdaq/GatewaySupervisor/GatewaySupervisor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,19 @@ void GatewaySupervisor::init(void)
std::thread([](GatewaySupervisor* s) { GatewaySupervisor::AppStatusWorkLoop(s); }, this).detach();
}
else
{
__COUT__ << "App Status checking is disabled." << __E__;

//set all app status to "Not Monitored" so that FSM changes ignore missing app status
for(const auto& it : allSupervisorInfo_.getAllSupervisorInfo())
{
auto appInfo = it.second;
allSupervisorInfo_.setSupervisorStatus(appInfo,
SupervisorInfo::APP_STATUS_NOT_MONITORED,
0 /* progressInteger */, "" /* detail */);
}
}

} // end checking of Application Status

} // end init()
Expand All @@ -192,7 +204,10 @@ void GatewaySupervisor::AppStatusWorkLoop(GatewaySupervisor* theSupervisor)
bool firstError = true;
std::string status, progress, detail, appName;
int progressInteger;
bool oneStatusReqHasFailed = false;
bool oneStatusReqHasFailed = false;

std::map<std::string /* appName */, bool /* lastStatusGood */> appLastStatusGood;

while(1)
{
sleep(1);
Expand All @@ -208,7 +223,7 @@ void GatewaySupervisor::AppStatusWorkLoop(GatewaySupervisor* theSupervisor)
auto appInfo = it.second;
appName = appInfo.getName();
// __COUT__ << "Getting Status "
// << " Supervisor instance = '" << appInfo.getName()
// << " Supervisor instance = '" << appName
// << "' [LID=" << appInfo.getId() << "] in Context '"
// << appInfo.getContextName() << "' [URL=" <<
// appInfo.getURL()
Expand Down Expand Up @@ -277,10 +292,13 @@ void GatewaySupervisor::AppStatusWorkLoop(GatewaySupervisor* theSupervisor)
{
xoap::MessageReference statusMessage = theSupervisor->sendWithSOAPReply(appInfo.getDescriptor(), tempMessage);

// __COUT__ << "statusMessage... "
// <<
// SOAPUtilities::translate(statusMessage)
// << std::endl;
// if("ContextARTDAQ" == appInfo.getContextName() )
// __COUT__ << " Supervisor instance = '" << appName
// << "' [LID=" << appInfo.getId() << "] in Context '"
// << appInfo.getContextName() << " statusMessage... "
// <<
// SOAPUtilities::translate(statusMessage)
// << std::endl;

SOAPParameters parameters;
parameters.addParameter("Status");
Expand All @@ -296,7 +314,22 @@ void GatewaySupervisor::AppStatusWorkLoop(GatewaySupervisor* theSupervisor)
if(progress.empty())
progress = "100";

// if("ContextARTDAQ" == appInfo.getContextName() )
// __COUTV__(progress);

detail = parameters.getValue("Detail");

if(!appLastStatusGood[appName])
{
__COUT__ << "First Good Status from "
<< " Supervisor instance = '" << appName
<< "' [LID=" << appInfo.getId() << "] in Context '"
<< appInfo.getContextName() << "' [URL=" <<
appInfo.getURL()
<< "].\n\n";
__COUTV__(SOAPUtilities::translate(tempMessage));
}
appLastStatusGood[appName] = true;
}
catch(const xdaq::exception::Exception& e)
{
Expand All @@ -309,11 +342,18 @@ void GatewaySupervisor::AppStatusWorkLoop(GatewaySupervisor* theSupervisor)
firstError = false;
break;
}
__COUT__ << "Getting Status "
<< " Supervisor instance = '" << appInfo.getName() << "' [LID=" << appInfo.getId() << "] in Context '" << appInfo.getContextName()
<< "' [URL=" << appInfo.getURL() << "].\n\n";
__COUTV__(SOAPUtilities::translate(tempMessage));
__COUT_WARN__ << "Failed to send getStatus SOAP Message: " << e.what() << __E__;
if(appLastStatusGood[appName])
{
__COUT__ << "Getting Status "
<< " Supervisor instance = '" << appName
<< "' [LID=" << appInfo.getId() << "] in Context '"
<< appInfo.getContextName() << "' [URL=" <<
appInfo.getURL()
<< "].\n\n";
__COUTV__(SOAPUtilities::translate(tempMessage));
__COUT_WARN__ << "Failed to send getStatus SOAP Message - will suppress repeat errors: " << e.what() << __E__;
} //else quiet repeat error messages
appLastStatusGood[appName] = false;
}
catch(...)
{
Expand All @@ -326,11 +366,18 @@ void GatewaySupervisor::AppStatusWorkLoop(GatewaySupervisor* theSupervisor)
firstError = false;
break;
}
__COUT__ << "Getting Status "
<< " Supervisor instance = '" << appInfo.getName() << "' [LID=" << appInfo.getId() << "] in Context '" << appInfo.getContextName()
<< "' [URL=" << appInfo.getURL() << "].\n\n";
__COUTV__(SOAPUtilities::translate(tempMessage));
__COUT_WARN__ << "Failed to send getStatus SOAP Message due to unknown error." << __E__;
if(appLastStatusGood[appName])
{
__COUT__ << "Getting Status "
<< " Supervisor instance = '" << appName
<< "' [LID=" << appInfo.getId() << "] in Context '"
<< appInfo.getContextName() << "' [URL=" <<
appInfo.getURL()
<< "].\n\n";
__COUTV__(SOAPUtilities::translate(tempMessage));
__COUT_WARN__ << "Failed to send getStatus SOAP Message due to unknown error. Will suppress repeat errors." << __E__;
} //else quiet repeat error messages
appLastStatusGood[appName] = false;
}
} // end with non-gateway status request handling

Expand All @@ -343,6 +390,9 @@ void GatewaySupervisor::AppStatusWorkLoop(GatewaySupervisor* theSupervisor)
std::istringstream ssProgress(progress);
ssProgress >> progressInteger;

// if("ContextARTDAQ" == appInfo.getContextName() )
// __COUTV__(progressInteger);

theSupervisor->allSupervisorInfo_.setSupervisorStatus(appInfo, status, progressInteger, detail);

} // end of app loop
Expand Down Expand Up @@ -1318,13 +1368,13 @@ void GatewaySupervisor::stateConfigured(toolbox::fsm::FiniteStateMachine& /*fsm*
//==============================================================================
void GatewaySupervisor::inError(toolbox::fsm::FiniteStateMachine& /*fsm*/)
{
__COUT__ << "Fsm current state: "
<< "Failed" <<
// theStateMachine_.getCurrentStateName() //There may be a race condition here
__COUT__ << "Error occured - FSM current state: "
<< "Failed? = " <<
theStateMachine_.getCurrentStateName() << //There may be a race condition here
// when async errors occur (e.g. immediately in running)
" from " << theStateMachine_.getProvenanceStateName() << __E__;

__COUTV__(SOAPUtilities::translate(theStateMachine_.getCurrentMessage()).getCommand());
__COUT__ << "Error occured on command: " << (SOAPUtilities::translate(theStateMachine_.getCurrentMessage()).getCommand()) << __E__;

// if coming from Running or Paused, update Run Info w/ERROR
if(theStateMachine_.getProvenanceStateName() == RunControlStateMachine::RUNNING_STATE_NAME ||
Expand Down Expand Up @@ -1426,10 +1476,10 @@ void GatewaySupervisor::enteringError(toolbox::Event::Reference e)

theStateMachine_.setErrorMessage(ss.str());

if(theStateMachine_.getCurrentStateName() == RunControlStateMachine::FAILED_STATE_NAME)
__COUT__ << "Already in failed state, so not broadcasting Error transition again." << __E__;
else // move everything else to Error!
broadcastMessage(SOAPUtilities::makeSOAPMessageReference("Error"));
// if(theStateMachine_.getCurrentStateName() == RunControlStateMachine::FAILED_STATE_NAME)
// __COUT__ << "Already in failed state, so not broadcasting Error transition again." << __E__;
// else // move everything else to Error!
broadcastMessage(SOAPUtilities::makeSOAPMessageReference("Error"));
} // end enteringError()

//==============================================================================
Expand Down Expand Up @@ -1768,7 +1818,10 @@ catch(...)
void GatewaySupervisor::transitionShuttingDown(toolbox::Event::Reference /*e*/)
try
{
__COUT__ << "Fsm current state: " << theStateMachine_.getCurrentStateName() << " message: " << theStateMachine_.getCurrentStateName() << __E__;
checkForAsyncError();

__COUT__ << "transitionShuttingDown -- Fsm current state: " << theStateMachine_.getCurrentStateName() <<
" message: " << theStateMachine_.getCurrentStateName() << __E__;

RunControlStateMachine::theProgressBar_.step();
makeSystemLogEntry("System shutting down.");
Expand Down Expand Up @@ -2353,11 +2406,23 @@ bool GatewaySupervisor::handleBroadcastMessageTarget(const SupervisorInfo& appI

__COUTV__(appInfo.getStatus());
//wait for app to exist in status before sending commands
int waitAttempts = 0;
while(appInfo.getStatus() == SupervisorInfo::APP_STATUS_UNKNOWN)
{
__COUT__ << "Broadcast thread " << threadIndex << "\t"
<< "Waiting for Supervisor " << appInfo.getName() << " [LID=" << appInfo.getId() << "] in unknown state." << __E__;
++waitAttempts;
if(waitAttempts == 10)
{
__SS__ << "Error! Gateway Supervisor failed to send message to app in unknown state "
"Supervisor instance = '"
<< appInfo.getName() << "' [LID=" << appInfo.getId() << "] in Context '" << appInfo.getContextName() << "' [URL=" << appInfo.getURL()
<< "].\n\n";
__COUT_ERR__ << ss.str();
XCEPT_RAISE(toolbox::fsm::exception::Exception, ss.str());
}
sleep(2);

}

// start recursive mutex scope (same thread can lock multiple times, but must unlock the same number of times)
Expand Down
18 changes: 9 additions & 9 deletions otsdaq/Macros/StringMacros.cc
Original file line number Diff line number Diff line change
Expand Up @@ -933,7 +933,7 @@ bool StringMacros::extractCommonChunks(const std::vector<std::string>& haystack,
else
break;
}
__COUT__ << "Low side = " << wildcardBounds.first << " " << haystack[0].substr(0, wildcardBounds.first) << __E__;
// __COUT__ << "Low side = " << wildcardBounds.first << " " << haystack[0].substr(0, wildcardBounds.first) << __E__;

// look for end matching segment
for(unsigned int n = 1; n < haystack.size(); ++n)
Expand All @@ -951,7 +951,7 @@ bool StringMacros::extractCommonChunks(const std::vector<std::string>& haystack,
break;
}

__COUT__ << "High side = " << wildcardBounds.second << " " << haystack[0].substr(wildcardBounds.second) << __E__;
// __COUT__ << "High side = " << wildcardBounds.second << " " << haystack[0].substr(wildcardBounds.second) << __E__;

// add first common chunk
commonChunksToReturn.push_back(haystack[0].substr(0, wildcardBounds.first));
Expand Down Expand Up @@ -1024,7 +1024,7 @@ bool StringMacros::extractCommonChunks(const std::vector<std::string>& haystack,
__SS_THROW__;
}
}
__COUTV__(fixedWildcardLength);
// __COUTV__(fixedWildcardLength);

if(fixedWildcardLength) // take trailing 0s out of common chunks
for(unsigned int c = 0; c < commonChunksToReturn.size(); ++c)
Expand Down Expand Up @@ -1059,15 +1059,15 @@ bool StringMacros::extractCommonChunks(const std::vector<std::string>& haystack,
if(wildcard == "")
{
// set wildcard for first time
__COUTV__(i);
__COUTV__(k);
__COUTV__(k - i);
// __COUTV__(i);
// __COUTV__(k);
// __COUTV__(k - i);

wildcard = haystack[n].substr(i, k - i);
if(fixedWildcardLength && n == 0)
fixedWildcardLength += wildcard.size();

__COUT__ << "name[" << n << "] = " << wildcard << " fixed @ " << fixedWildcardLength << __E__;
// __COUT__ << "name[" << n << "] = " << wildcard << " fixed @ " << fixedWildcardLength << __E__;

break;
}
Expand All @@ -1087,8 +1087,8 @@ bool StringMacros::extractCommonChunks(const std::vector<std::string>& haystack,

} // end name loop

__COUTV__(StringMacros::vectorToString(commonChunksToReturn));
__COUTV__(StringMacros::vectorToString(wildcardStringsToReturn));
// __COUTV__(StringMacros::vectorToString(commonChunksToReturn));
// __COUTV__(StringMacros::vectorToString(wildcardStringsToReturn));

if(wildcardStringsToReturn.size() != haystack.size())
{
Expand Down
5 changes: 5 additions & 0 deletions otsdaq/ProgressBar/ProgressBar.cc
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,11 @@ void ProgressBar::step()
{
std::lock_guard<std::mutex> lock(theMutex_); // lock out for remainder of scope
++stepCount_;

//do not allow progress to reach 100% until complete (in case stepsToComplete is not constant over time)
if(stepsToComplete_ && stepCount_ >= stepsToComplete_)
stepsToComplete_ = stepCount_ + 1;

// std::cout << __COUT_HDR_FL__ << totalStepsFileName_ << " " <<
// readPercentageString() << "% complete" << std::endl;
}
Expand Down
3 changes: 2 additions & 1 deletion otsdaq/SupervisorInfo/SupervisorInfo.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

using namespace ots;

const std::string SupervisorInfo::APP_STATUS_UNKNOWN = "Unknown";
const std::string SupervisorInfo::APP_STATUS_UNKNOWN = "Unknown";
const std::string SupervisorInfo::APP_STATUS_NOT_MONITORED = "Not Monitored";

//=====================================================================================
void SupervisorInfo::setStatus(const std::string& status, const unsigned int progress, const std::string& detail)
Expand Down
1 change: 1 addition & 0 deletions otsdaq/SupervisorInfo/SupervisorInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ class SupervisorInfo


static const std::string APP_STATUS_UNKNOWN;
static const std::string APP_STATUS_NOT_MONITORED;

// BOOLs -------------------
bool isGatewaySupervisor (void) const { return class_ == XDAQContextTable::GATEWAY_SUPERVISOR_CLASS; }
Expand Down
Loading