Permalink
Browse files

High: crmd: Improved handling of node up/down events

  • Loading branch information...
1 parent 9342a5a commit 9943bbb85c76b8769744562509b35a74bbe77e90 @beekhof committed Aug 31, 2012
Showing with 78 additions and 96 deletions.
  1. +78 −53 crmd/callbacks.c
  2. +0 −43 crmd/te_callbacks.c
View
@@ -111,8 +111,8 @@ lrm_op_callback(lrmd_event_data_t * op)
void
peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *data)
{
- gboolean reset_status_entry = FALSE;
uint32_t old = 0;
+ const char *status = NULL;
set_bit(fsa_input_register, R_PEER_DATA);
if (node->uname == NULL) {
@@ -121,80 +121,105 @@ peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *d
switch (type) {
case crm_status_uname:
- crm_info("status: %s is now %s", node->uname, node->state);
- /* reset_status_entry = TRUE; */
/* If we've never seen the node, then it also won't be in the status section */
- break;
+ crm_info("%s is now %s", node->uname, node->state);
+ return;
case crm_status_nstate:
- crm_info("status: %s is now %s (was %s)", node->uname, node->state, (const char *)data);
- reset_status_entry = TRUE;
+ crm_info("%s is now %s (was %s)", node->uname, node->state, (const char *)data);
+ if(safe_str_neq(data, node->state)) {
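+ /* State did not change -- NOTE(review): safe_str_neq() presumably returns true when the strings differ, which would make this early return fire on a change rather than on no change (safe_str_eq expected?); confirm */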
+ return;
+ }
break;
case crm_status_processes:
if (data) {
old = *(const uint32_t *)data;
}
- if ((node->processes ^ old) & proc_flags) {
- /* crmd_proc_update(node, proc_flags); */
- const char *status = (node->processes & proc_flags) ? ONLINESTATUS : OFFLINESTATUS;
- crm_notice("Status update: Client %s/%s now has status [%s] (DC=%s)",
- node->uname, peer2text(proc_flags), status, AM_I_DC ? "true" : crm_str(fsa_our_dc));
-
- if (is_set(fsa_input_register, R_CIB_CONNECTED) == FALSE) {
- return;
- } else if (fsa_state == S_STOPPING) {
- return;
- }
-
- if (safe_str_eq(node->uname, fsa_our_dc) && crm_is_peer_active(node) == FALSE) {
- /* Did the DC leave us? */
- crm_info("Got client status callback - our DC is dead");
- register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ELECTION, NULL);
-
- } else if (AM_I_DC) {
- xmlNode *update = NULL;
-
- if ((node->processes & proc_flags) == 0) {
- erase_node_from_join(node->uname);
- check_join_state(fsa_state, __FUNCTION__);
- fail_incompletable_actions(transition_graph, node->uuid);
-
- } else {
- register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL);
- }
-
- update = do_update_node_cib(node, node_update_peer, NULL, __FUNCTION__);
- fsa_cib_anon_update(
- XML_CIB_TAG_STATUS, update, cib_scope_local | cib_quorum_override | cib_can_create);
- free_xml(update);
- }
+ /* crmd_proc_update(node, proc_flags); */
+ status = (node->processes & proc_flags) ? ONLINESTATUS : OFFLINESTATUS;
+ crm_info("Client %s/%s now has status [%s] (DC=%s)",
+ node->uname, peer2text(proc_flags), status, AM_I_DC ? "true" : crm_str(fsa_our_dc));
+
+ if (((node->processes ^ old) & proc_flags) == 0) {
+ /* Peer process did not change */
+ crm_trace("No change %6x %6x %6x", old, node->processes, proc_flags);
+ return;
+ } else if (is_set(fsa_input_register, R_CIB_CONNECTED) == FALSE) {
+ crm_trace("Not connected");
+ return;
+ } else if (fsa_state == S_STOPPING) {
+ crm_trace("Stopping");
+ return;
+ }
- trigger_fsa(fsa_source);
+ if (safe_str_eq(node->uname, fsa_our_dc) && crm_is_peer_active(node) == FALSE) {
+ /* Did the DC leave us? */
+ crm_notice("Got client status callback - our DC is dead");
+ register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ELECTION, NULL);
}
break;
}
- /* Can this be removed now that do_cl_join_finalize_respond() does the same thing?
- *
- * Dunno, but not the call to crm_update_peer_expected()
- */
- if (AM_I_DC && reset_status_entry && safe_str_eq(CRMD_JOINSTATE_MEMBER, node->state)) {
- crm_action_t *down = match_down_event(0, node->uname, NULL);
+ if (AM_I_DC) {
+ xmlNode *update = NULL;
+ crm_action_t *down = match_down_event(0, node->uuid, NULL);
+ gboolean alive = crm_is_peer_active(node);
+
+ if(alive && type == crm_status_processes) {
+ register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL);
+ }
- erase_status_tag(node->uname, XML_CIB_TAG_LRM, cib_scope_local);
- erase_status_tag(node->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local);
+ crm_trace("Alive=%d, down=%p", alive, down);
+
if (down) {
const char *task = crm_element_value(down->xml, XML_LRM_ATTR_TASK);
- if (safe_str_eq(task, CRM_OP_FENCE)) {
- crm_info("Node return implies stonith of %s (action %d) completed", node->uname,
- down->id);
+ if (alive && safe_str_eq(task, CRM_OP_FENCE)) {
+ crm_info("Node return implies stonith of %s (action %d) completed", node->uname, down->id);
+ erase_status_tag(node->uname, XML_CIB_TAG_LRM, cib_scope_local);
+ erase_status_tag(node->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local);
down->confirmed = TRUE;
+
+ } else if (safe_str_eq(task, CRM_OP_FENCE)) {
+ crm_trace("Waiting for stonithd to report the fencing of %s is complete", node->uname); /* via tengine_stonith_callback() */
+
+ } else if(alive == FALSE) {
+ crm_notice("%s of %s (op %d) is complete", task, node->uname, down->id);
+ down->confirmed = TRUE;
+ stop_te_timer(down->timer);
+
+ erase_node_from_join(node->uname);
+ crm_update_peer_expected(__FUNCTION__, node, CRMD_JOINSTATE_DOWN);
+ check_join_state(fsa_state, __FUNCTION__);
+
+ update_graph(transition_graph, down);
+ trigger_graph();
+
+ } else {
+ crm_trace("Other %p", down);
}
+
+ } else if(alive == FALSE) {
+ crm_notice("Stonith/shutdown of %s not matched", node->uname);
+
+ erase_node_from_join(node->uname);
+ crm_update_peer_expected(__FUNCTION__, node, CRMD_JOINSTATE_DOWN);
+ check_join_state(fsa_state, __FUNCTION__);
+
+ abort_transition(INFINITY, tg_restart, "Node failure", NULL);
+ fail_incompletable_actions(transition_graph, node->uuid);
+ } else {
+ crm_trace("Other %p", down);
}
- crm_update_peer_expected(__FUNCTION__, node, CRMD_JOINSTATE_DOWN);
+ update = do_update_node_cib(node, node_update_peer, NULL, __FUNCTION__);
+ fsa_cib_anon_update(
+ XML_CIB_TAG_STATUS, update, cib_scope_local | cib_quorum_override | cib_can_create);
+ free_xml(update);
}
+
+ trigger_fsa(fsa_source);
}
void
View
@@ -198,49 +198,6 @@ te_update_diff(const char *event, xmlNode * msg)
xmlXPathFreeObject(xpathObj);
}
- /* Check for node state updates... possibly from a shutdown we requested */
- xpathObj =
- xpath_search(diff, "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//" XML_CIB_TAG_STATE);
- if (xpathObj) {
- int lpc = 0, max = xpathObj->nodesetval->nodeNr;
-
- for (lpc = 0; lpc < max; lpc++) {
- xmlNode *node = getXpathResult(xpathObj, lpc);
- const char *event_node = crm_element_value(node, XML_ATTR_ID);
- const char *event_uname = crm_element_value(node, XML_ATTR_UNAME);
- const char *is_peer = crm_element_value(node, XML_NODE_IS_PEER);
- /* Don't check the value of XML_NODE_IN_CLUSTER, only pacemaker may have been shut down */
- if (safe_str_eq(is_peer, OFFLINESTATUS)) {
- /* Pacemaker is now stopped/gone
- * Was it a shutdown or fencing operation?
- */
- crm_action_t *shutdown = match_down_event(0, event_node, NULL);
-
- if (shutdown != NULL) {
- const char *task = crm_element_value(shutdown->xml, XML_LRM_ATTR_TASK);
-
- if (safe_str_eq(task, CRM_OP_FENCE)) {
- crm_trace("Waiting for stonithd to report the fencing of %s is complete", event_uname); /* via tengine_stonith_callback() */
-
- } else {
- crm_debug("%s of %s (op %d) is complete", task, event_uname, shutdown->id);
- /* match->confirmed = TRUE; */
- stop_te_timer(shutdown->timer);
- erase_node_from_join(event_uname);
- update_graph(transition_graph, shutdown);
- trigger_graph();
- }
-
- } else {
- crm_info("Stonith/shutdown of %s not matched", event_node);
- abort_transition(INFINITY, tg_restart, "Node failure", node);
- }
- fail_incompletable_actions(transition_graph, event_node);
- }
- }
- xmlXPathFreeObject(xpathObj);
- }
-
/*
* Check for and fast-track the processing of LRM refreshes
* In large clusters this can result in _huge_ speedups

0 comments on commit 9943bbb

Please sign in to comment.