Skip to content

Commit

Permalink
Merge pull request #964 from nacc/disable_netboot_on_failure
Browse files Browse the repository at this point in the history
Disable netboot on failure
  • Loading branch information
lmr committed Sep 9, 2015
2 parents f54ef40 + 664f11b commit 2b997b2
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 0 deletions.
51 changes: 51 additions & 0 deletions scheduler/monitor_db_cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,16 @@
import time
import logging
import random
import socket
from autotest.frontend.afe import models
from autotest.scheduler import scheduler_config
from autotest.client.shared import host_protections, mail
from autotest.client.shared.settings import settings
from autotest.server.hosts import remote


class InstallServerUnavailable(Exception):
    """
    Raised when the install server interface could not be created before
    the retry window elapsed.
    """


class PeriodicCleanup(object):
Expand Down Expand Up @@ -53,13 +60,55 @@ def _cleanup(self):
self._check_for_db_inconsistencies()
self._reverify_dead_hosts()

def _disable_host_installation(self, host):
    """
    Ask the configured install server to disable installation for a host.

    Does nothing when no install server is configured. Otherwise the
    install server interface is instantiated, retrying on socket errors
    for a tenth of the ``default_install_timeout`` setting (section
    ``INSTALL_SERVER``, 3600 s when unset), and the request is delegated
    to that interface's ``_disable_host_installation`` method.

    :param host: host handed over to the install server interface.
    :raise InstallServerUnavailable: if the interface could not be created
            before the retry window elapsed.
    """
    server_info = remote.get_install_server_info()
    if not remote.install_server_is_configured():
        return

    timeout = settings.get_value('INSTALL_SERVER',
                                 'default_install_timeout',
                                 type=int,
                                 default=3600)

    # Only a tenth of the install timeout is spent waiting for the server.
    retry_window = timeout // 10
    # Never sleep for 0 s: with a timeout below 100 the original step was 0
    # and the loop below hammered the install server without pausing.
    step = max(1, timeout // 100)
    end_time = time.time() + retry_window

    ServerInterface = remote.RemoteHost.INSTALL_SERVER_MAPPING[
        server_info['type']]
    server_interface = None
    while time.time() < end_time:
        try:
            server_interface = ServerInterface(**server_info)
            break
        except socket.error:
            logging.error('Install server unavailable. Trying '
                          'again in %s s...', step)
            time.sleep(step)

    if server_interface is None:
        raise InstallServerUnavailable("%s install server at (%s) "
                                       "unavailable. Tried to "
                                       "communicate for %s s" %
                                       (server_info['type'],
                                        server_info['xmlrpc_url'],
                                        retry_window))

    server_interface._disable_host_installation(host)

def _abort_timed_out_jobs(self):
    """
    Abort every incomplete job whose timeout has expired.

    For each such job, installation is first disabled on every host that
    one of its queue entries was actually scheduled on, then the job is
    aborted.

    NOTE(review): an exception raised while disabling installation
    propagates and prevents ``job.abort()`` from running for that job --
    confirm this is the intended behaviour.
    """
    logging.info('Aborting all jobs that have timed out and are not complete')
    timed_out_jobs = models.Job.objects.filter(
        hostqueueentry__complete=False).extra(
        where=['created_on + INTERVAL timeout HOUR < NOW()'])
    for job in timed_out_jobs.distinct():
        logging.warning('Aborting job %d due to job timeout', job.id)
        # Filter on the job id directly. The previous INNER JOIN on afe_jobs
        # had a condition that never referenced afe_jobs, so it returned
        # each entry id once per row of that table.
        rows = self._db.execute("""
            SELECT hqe.id
            FROM afe_host_queue_entries AS hqe
            WHERE hqe.job_id = %d""" % job.id)
        entries = models.HostQueueEntry.objects.filter(
            id__in=[row[0] for row in rows])
        for queue_entry in entries.distinct():
            # ensure we only disable installation on actually scheduled hosts
            if queue_entry.host is None:
                continue
            logging.info('Disabling installation on %s',
                         queue_entry.host.hostname)
            self._disable_host_installation(queue_entry.host)
        job.abort()

def _abort_jobs_past_max_runtime(self):
Expand All @@ -77,6 +126,8 @@ def _abort_jobs_past_max_runtime(self):
id__in=[row[0] for row in rows])
for queue_entry in query.distinct():
logging.warning('Aborting entry %s due to max runtime', queue_entry)
logging.info('Disabling installation on %s' % queue_entry.host.hostname)
self._disable_host_installation(queue_entry.host.hostname)
queue_entry.abort()

def _check_for_db_inconsistencies(self):
Expand Down
22 changes: 22 additions & 0 deletions server/hosts/install_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,26 @@ def _set_host_profile(self, host, profile=''):
logging.error("DHCP sync failed, error code: %s, error string: %s",
err.faultCode, err.faultString)

def _disable_host_installation(self, host):
    """
    Disable netboot for a host on the install server.

    Sets ``netboot_enabled`` to ``'False'`` on the host's system record,
    saves it, and then asks the server to synchronize its DHCP
    configuration. A failed DHCP sync is logged, never raised; the
    "unknown remote method" fault is ignored silently.

    :param host: host whose system handle is looked up through
            ``self.get_system_handle``.
    """
    # Only the handle is needed; the system record itself is never read, so
    # no extra get_system() round trip is made.
    _, system_handle = self.get_system_handle(host)

    # Disable netboot for that machine (principle of least surprise on
    # failure)
    self.server.modify_system(system_handle, 'netboot_enabled', 'False',
                              self.token)
    self.server.save_system(system_handle, self.token)
    try:
        # Cobbler only generates the DHCP configuration for netboot enabled
        # machines, so we need to synchronize the dhcpd file after changing
        # the value above
        self.server.sync_dhcp(self.token)
    except xmlrpclib.Fault as err:
        # older Cobbler will not recognize the above command
        if "unknown remote method" not in err.faultString:
            logging.error("DHCP sync failed, error code: %s, error string: %s",
                          err.faultCode, err.faultString)

def install_host(self, host, profile='', timeout=None, num_attempts=2):
"""
Install a host object with profile name defined by distro.
Expand Down Expand Up @@ -163,6 +183,8 @@ def install_host(self, host, profile='', timeout=None, num_attempts=2):
if not install_successful:
e_msg = 'Host %s install timed out' % host.hostname
host.record("END FAIL", None, "install", e_msg)
self._disable_host_installation(host)

raise error.HostInstallTimeoutError(e_msg)

remove_hosts_file()
Expand Down

0 comments on commit 2b997b2

Please sign in to comment.