Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

only update powerstate if sure it is the latest #3743

Merged
merged 2 commits into from Jan 7, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -1242,7 +1242,7 @@ protected void processRequest(final Link link, final Request request) {
if (s_logger.isDebugEnabled()) {
if (cmd instanceof PingRoutingCommand) {
logD = false;
s_logger.debug("Ping from " + hostId + "(" + hostName + ")");
s_logger.debug("Ping from Routing host " + hostId + "(" + hostName + ")");
s_logger.trace("SeqA " + hostId + "-" + request.getSequence() + ": Processing " + request);
} else if (cmd instanceof PingCommand) {
logD = false;
Expand Down
Expand Up @@ -4212,7 +4212,8 @@ protected void HandlePowerStateReport(final String subject, final String senderA
break;

case PowerOff:
case PowerReportMissing:
case PowerReportMissing: // rigorously set to Migrating? or just do nothing until...? or create a missing state?
// for now handle in line with legacy as power off
handlePowerOffReportWithNoPendingJobsOnVM(vm);
break;

Expand Down Expand Up @@ -4323,8 +4324,15 @@ private void handlePowerOffReportWithNoPendingJobsOnVM(final VMInstanceVO vm) {
case Running:
case Stopped:
case Migrating:
s_logger.info("VM " + vm.getInstanceName() + " is at " + vm.getState() + " and we received a power-off report while there is no pending jobs on it");
if(vm.isHaEnabled() && vm.getState() == State.Running && HaVmRestartHostUp.value() && vm.getHypervisorType() != HypervisorType.VMware && vm.getHypervisorType() != HypervisorType.Hyperv) {
if (s_logger.isInfoEnabled()) {
s_logger.info(
String.format("VM %s is at %s and we received a %s report while there is no pending jobs on it"
, vm.getInstanceName(), vm.getState(), vm.getPowerState()));
}
if(vm.isHaEnabled() && vm.getState() == State.Running
&& HaVmRestartHostUp.value()
&& vm.getHypervisorType() != HypervisorType.VMware
&& vm.getHypervisorType() != HypervisorType.Hyperv) {
s_logger.info("Detected out-of-band stop of a HA enabled VM " + vm.getInstanceName() + ", will schedule restart");
if(!_haMgr.hasPendingHaWork(vm.getId())) {
_haMgr.scheduleRestart(vm, true);
Expand All @@ -4334,11 +4342,14 @@ private void handlePowerOffReportWithNoPendingJobsOnVM(final VMInstanceVO vm) {
return;
}

final VirtualMachineGuru vmGuru = getVmGuru(vm);
final VirtualMachineProfile profile = new VirtualMachineProfileImpl(vm);
if (!sendStop(vmGuru, profile, true, true)) {
// In case StopCommand fails, don't proceed further
return;
// not when report is missing
if(PowerState.PowerOff.equals(vm.getPowerState())) {
final VirtualMachineGuru vmGuru = getVmGuru(vm);
final VirtualMachineProfile profile = new VirtualMachineProfileImpl(vm);
if (!sendStop(vmGuru, profile, true, true)) {
// In case StopCommand fails, don't proceed further
return;
}
}

try {
Expand Down
Expand Up @@ -38,7 +38,6 @@ public class VirtualMachinePowerStateSyncImpl implements VirtualMachinePowerStat

@Inject MessageBus _messageBus;
@Inject VMInstanceDao _instanceDao;
@Inject VirtualMachineManager _vmMgr;
@Inject ManagementServiceConfiguration mgmtServiceConf;

public VirtualMachinePowerStateSyncImpl() {
Expand Down Expand Up @@ -69,25 +68,30 @@ public void processHostVmStatePingReport(long hostId, Map<String, HostVmStateRep

private void processReport(long hostId, Map<Long, VirtualMachine.PowerState> translatedInfo) {

if (s_logger.isDebugEnabled())
if (s_logger.isDebugEnabled()) {
s_logger.debug("Process VM state report. host: " + hostId + ", number of records in report: " + translatedInfo.size());
}

for (Map.Entry<Long, VirtualMachine.PowerState> entry : translatedInfo.entrySet()) {

if (s_logger.isDebugEnabled())
s_logger.debug("VM state report. host: " + hostId + ", vm id: " + entry.getKey() + ", power state: " + entry.getValue());

if (_instanceDao.updatePowerState(entry.getKey(), hostId, entry.getValue())) {
if (s_logger.isDebugEnabled())
s_logger.debug("VM state report is updated. host: " + hostId + ", vm id: " + entry.getKey() + ", power state: " + entry.getValue());
if (_instanceDao.updatePowerState(entry.getKey(), hostId, entry.getValue(), DateUtil.currentGMTTime())) {
if (s_logger.isInfoEnabled()) {
s_logger.info("VM state report is updated. host: " + hostId + ", vm id: " + entry.getKey() + ", power state: " + entry.getValue());
}

_messageBus.publish(null, VirtualMachineManager.Topics.VM_POWER_STATE, PublishScope.GLOBAL, entry.getKey());
} else {
if (s_logger.isDebugEnabled())
s_logger.debug("VM power state does not change, skip DB writing. vm id: " + entry.getKey());
if (s_logger.isTraceEnabled()) {
s_logger.trace("VM power state does not change, skip DB writing. vm id: " + entry.getKey());
}
}
}

// any state outdates should be checked against the time before this list was retrieved
Date startTime = DateUtil.currentGMTTime();
// for all running/stopping VMs, we provide monitoring of missing report
List<VMInstanceVO> vmsThatAreMissingReport = _instanceDao.findByHostInStates(hostId, VirtualMachine.State.Running,
VirtualMachine.State.Stopping, VirtualMachine.State.Starting);
Expand All @@ -98,10 +102,12 @@ private void processReport(long hostId, Map<Long, VirtualMachine.PowerState> tra
it.remove();
}

// here we need to be wary of out of band migration as opposed to other, more unexpected state changes
if (vmsThatAreMissingReport.size() > 0) {
Date currentTime = DateUtil.currentGMTTime();
if (s_logger.isDebugEnabled())
if (s_logger.isDebugEnabled()) {
s_logger.debug("Run missing VM report. current time: " + currentTime.getTime());
}

// 2 times of sync-update interval for graceful period
long milliSecondsGracefullPeriod = mgmtServiceConf.getPingInterval() * 2000L;
Expand Down Expand Up @@ -130,23 +136,34 @@ private void processReport(long hostId, Map<Long, VirtualMachine.PowerState> tra
}
}

if (s_logger.isDebugEnabled())
s_logger.debug("Detected missing VM. host: " + hostId + ", vm id: " + instance.getId() +
", power state: PowerReportMissing, last state update: " + vmStateUpdateTime.getTime());
if (s_logger.isInfoEnabled()) {
s_logger.info(
String.format("Detected missing VM. host: %l, vm id: %l(%s), power state: %s, last state update: %l"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @DaanHoogland, I'm getting this error in latest master:

INFO  [c.c.a.m.AgentManagerImpl] (AgentTaskPool-98:ctx-ad691272) (logid:1a451217) The agent from host 1 state determined is Up
INFO  [c.c.a.m.AgentManagerImpl] (AgentTaskPool-98:ctx-ad691272) (logid:1a451217) Agent is determined to be up and running
WARN  [c.c.a.m.AgentManagerImpl] (AgentManager-Handler-8:null) (logid:) Caught: 
java.util.UnknownFormatConversionException: Conversion = 'l'
	at java.util.Formatter$FormatSpecifier.conversion(Formatter.java:2691)
	at java.util.Formatter$FormatSpecifier.<init>(Formatter.java:2720)
	at java.util.Formatter.parse(Formatter.java:2560)
	at java.util.Formatter.format(Formatter.java:2501)
	at java.util.Formatter.format(Formatter.java:2455)
	at java.lang.String.format(String.java:2940)
	at com.cloud.vm.VirtualMachinePowerStateSyncImpl.processReport(VirtualMachinePowerStateSyncImpl.java:141)
	at com.cloud.vm.VirtualMachinePowerStateSyncImpl.processHostVmStatePingReport(VirtualMachinePowerStateSyncImpl.java:66)
	at com.cloud.vm.VirtualMachineManagerImpl.processCommands(VirtualMachineManagerImpl.java:3245)
	at com.cloud.agent.manager.AgentManagerImpl.handleCommands(AgentManagerImpl.java:309)
	at com.cloud.agent.manager.AgentManagerImpl$AgentHandler.processRequest(AgentManagerImpl.java:1287)
	at com.cloud.agent.manager.AgentManagerImpl$AgentHandler.doTask(AgentManagerImpl.java:1370)
	at com.cloud.agent.manager.ClusteredAgentManagerImpl$ClusteredAgentHandler.doTask(ClusteredAgentManagerImpl.java:702)
	at com.cloud.utils.nio.Task.call(Task.java:83)
	at com.cloud.utils.nio.Task.call(Task.java:29)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, i know, fixed in #3806

, hostId
, instance.getId()
, instance.getUuid()
, VirtualMachine.PowerState.PowerReportMissing
, vmStateUpdateTime.getTime()));
}

long milliSecondsSinceLastStateUpdate = currentTime.getTime() - vmStateUpdateTime.getTime();

if (milliSecondsSinceLastStateUpdate > milliSecondsGracefullPeriod) {
s_logger.debug("vm id: " + instance.getId() + " - time since last state update(" + milliSecondsSinceLastStateUpdate + "ms) has passed graceful period");

if (_instanceDao.updatePowerState(instance.getId(), hostId, VirtualMachine.PowerState.PowerReportMissing)) {
if (s_logger.isDebugEnabled())
// this is were a race condition might have happened if we don't re-fetch the instance;
// between the startime of this job and the currentTime of this missing-branch
// an update might have occurred that we should not override in case of out of band migration
if (_instanceDao.updatePowerState(instance.getId(), hostId, VirtualMachine.PowerState.PowerReportMissing, startTime)) {
if (s_logger.isDebugEnabled()) {
s_logger.debug("VM state report is updated. host: " + hostId + ", vm id: " + instance.getId() + ", power state: PowerReportMissing ");
}

_messageBus.publish(null, VirtualMachineManager.Topics.VM_POWER_STATE, PublishScope.GLOBAL, instance.getId());
} else {
if (s_logger.isDebugEnabled())
if (s_logger.isDebugEnabled()) {
s_logger.debug("VM power state does not change, skip DB writing. vm id: " + instance.getId());
}
}
} else {
s_logger.debug("vm id: " + instance.getId() + " - time since last state update(" + milliSecondsSinceLastStateUpdate + "ms) has not passed graceful period yet");
Expand Down
Expand Up @@ -139,7 +139,7 @@ public interface VMInstanceDao extends GenericDao<VMInstanceVO, Long>, StateDao<

List<VMInstanceVO> listStartingWithNoHostId();

boolean updatePowerState(long instanceId, long powerHostId, VirtualMachine.PowerState powerState);
boolean updatePowerState(long instanceId, long powerHostId, VirtualMachine.PowerState powerState, Date wisdomEra);

void resetVmPowerStateTracking(long instanceId);

Expand Down
Expand Up @@ -828,13 +828,15 @@ public List<VMInstanceVO> listStartingWithNoHostId() {
}

@Override
public boolean updatePowerState(final long instanceId, final long powerHostId, final VirtualMachine.PowerState powerState) {
public boolean updatePowerState(final long instanceId, final long powerHostId, final VirtualMachine.PowerState powerState, Date wisdomEra) {
return Transaction.execute(new TransactionCallback<Boolean>() {
@Override
public Boolean doInTransaction(TransactionStatus status) {
boolean needToUpdate = false;
VMInstanceVO instance = findById(instanceId);
if (instance != null) {
if (instance != null
&& (null == instance.getPowerStateUpdateTime()
|| instance.getPowerStateUpdateTime().before(wisdomEra))) {
Long savedPowerHostId = instance.getPowerHostId();
if (instance.getPowerState() != powerState || savedPowerHostId == null
|| savedPowerHostId.longValue() != powerHostId) {
Expand Down
Expand Up @@ -3817,7 +3817,7 @@ protected Answer execute(StopCommand cmd) {
}
} else {

String msg = "VM " + cmd.getVmName() + " is no longer in vSphere";
String msg = "VM " + cmd.getVmName() + " is no longer on the expected host in vSphere";
s_logger.info(msg);
return new StopAnswer(cmd, msg, true);
}
Expand Down