Skip to content

Commit

Permalink
Fix bug with starting build
Browse files Browse the repository at this point in the history
try/except ping codeblock
self.conn replaced to self.slave.conn because disconnecting old slave breaks reference
added few log messages
  • Loading branch information
Michael Mayorov authored and djmitche committed Sep 29, 2013
1 parent 0743967 commit f457aef
Show file tree
Hide file tree
Showing 6 changed files with 27 additions and 19 deletions.
1 change: 1 addition & 0 deletions master/buildbot/buildslave/base.py
Expand Up @@ -355,6 +355,7 @@ def attached(self, conn):
# We want to know when the graceful shutdown flag changes
self.slave_status.addGracefulWatcher(self._gracefulChanged)
self.conn = conn
self._old_builder_list = None # clear builder list before proceed

d = defer.succeed(None)

Expand Down
27 changes: 18 additions & 9 deletions master/buildbot/buildslave/manager.py
Expand Up @@ -97,19 +97,28 @@ def _unregister(self, registration):
@defer.inlineCallbacks
def newConnection(self, conn, buildslaveName):
if buildslaveName in self.connections:
log.msg("Got duplication connection from '%s'"
" starting arbitration procedure" % buildslaveName)
old_conn = self.connections[buildslaveName]
# returns:
# (None, 0) if ping was successfull, that means old connection stil alive
# (None, 1) if timeout expired and old slave didn't respond
res, pos = yield defer.DeferredList(
[old_conn.remotePrint("master got a duplicate connection"),
task.deferLater(reactor, self.PING_TIMEOUT, lambda : None)],
fireOnOneCallback=True
)
if pos == 0:
# if we get here then old connection still alives and new should
# be rejected
defer.returnValue(Failure(RuntimeError("rejecting duplicate slave")))
try:
res, pos = yield defer.DeferredList(
[old_conn.remotePrint("master got a duplicate connection"),
task.deferLater(reactor, self.PING_TIMEOUT, lambda : None)],
fireOnOneCallback=True
)
if pos == 0:
# if we get here then old connection still alives and new should
# be rejected
defer.returnValue(
Failure(RuntimeError("rejecting duplicate slave"))
)
except Exception, e:
log.msg("Got error while trying to ping connected slave %s:"
"%s" % (buildslaveName, e))
log.msg("Old connection for '%s' was lost, accepting new" % buildslaveName)

self.connections[buildslaveName] = conn
def remove():
Expand Down
5 changes: 3 additions & 2 deletions master/buildbot/buildslave/protocols/pb.py
Expand Up @@ -207,8 +207,9 @@ def _errback(why):
return defer.succeed(None)
yield old_way()

def remoteStartBuild(self):
return self.mind.callRemote('startBuild')
def remoteStartBuild(self, builder_name):
slavebuilder = self.builders.get(builder_name)
return slavebuilder.callRemote('startBuild')

def stopKeepaliveTimer(self):
if self.keepalive_timer and self.keepalive_timer.active():
Expand Down
2 changes: 1 addition & 1 deletion master/buildbot/process/build.py
Expand Up @@ -226,7 +226,7 @@ def startBuild(self, build_status, expectations, slavebuilder):
# then narrow SlaveLocks down to the right slave
self.locks = [(l.getLock(self.slavebuilder.slave), a)
for l, a in self.locks ]
self.conn = slavebuilder.conn
self.conn = slavebuilder.slave.conn
self.conn.notifyOnDisconnect(self.lostRemote) # TODO: save subscription

metrics.MetricCountEvent.log('active_builds', 1)
Expand Down
4 changes: 2 additions & 2 deletions master/buildbot/process/builder.py
Expand Up @@ -364,7 +364,7 @@ def run_cleanups():

# tell the remote that it's starting a build, too
try:
yield slavebuilder.conn.remoteStartBuild()
yield slavebuilder.slave.conn.remoteStartBuild(build.builder.name)
except:
log.err(failure.Failure(), 'while calling remote startBuild:')
run_cleanups()
Expand Down Expand Up @@ -392,7 +392,7 @@ def run_cleanups():
# and now. If so, bail out. The build.startBuild call below transfers
# responsibility for monitoring this connection to the Build instance,
# so this check ensures we hand off a working connection.
if not slavebuilder.conn: # TODO: replace with isConnected()
if not slavebuilder.slave.conn: # TODO: replace with isConnected()
log.msg("slave disappeared before build could start")
run_cleanups()
defer.returnValue(False)
Expand Down
7 changes: 2 additions & 5 deletions master/buildbot/process/slavebuilder.py
Expand Up @@ -29,7 +29,6 @@ class AbstractSlaveBuilder(pb.Referenceable):
def __init__(self):
self.ping_watchers = []
self.state = None # set in subclass
self.conn = None
self.slave = None
self.builder_name = None
self.locks = None
Expand Down Expand Up @@ -85,7 +84,6 @@ def attached(self, slave, commands):
@param commands: provides the slave's version of each RemoteCommand
"""
self.state = ATTACHING
self.conn = slave.conn
self.remoteCommands = commands # maps command name to version
if self.slave is None:
self.slave = slave
Expand All @@ -97,7 +95,7 @@ def attached(self, slave, commands):
d = defer.succeed(None)

d.addCallback(lambda _:
self.conn.remotePrint(message="attached"))
self.slave.conn.remotePrint(message="attached"))

def setIdle(res):
self.state = IDLE
Expand Down Expand Up @@ -131,7 +129,7 @@ def ping(self, status=None):
self.ping_watchers.insert(0, d2)
# I think it will make the tests run smoother if the status
# is updated before the ping completes
Ping().ping(self.conn).addCallback(self._pong)
Ping().ping(self.slave.conn).addCallback(self._pong)

def reset_state(res):
if self.state == PINGING:
Expand All @@ -158,7 +156,6 @@ def detached(self):
if self.slave:
self.slave.removeSlaveBuilder(self)
self.slave = None
self.conn = None
self.remoteCommands = None


Expand Down

0 comments on commit f457aef

Please sign in to comment.