From 0cbc9e43de52e91396976c809ffdc06c9988e06d Mon Sep 17 00:00:00 2001
From: Jiang Bian
Date: Sun, 10 Nov 2013 01:04:26 -0600
Subject: [PATCH] Fix bug in recreating a failed crawler

---
 tweetf0rm/scheduler.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tweetf0rm/scheduler.py b/tweetf0rm/scheduler.py
index 9cf1429..35fa1de 100644
--- a/tweetf0rm/scheduler.py
+++ b/tweetf0rm/scheduler.py
@@ -107,11 +107,17 @@ def crawler_status(self):
 		for crawler_id in self.crawlers:
 			cc = self.crawlers[crawler_id]
 			if ((not cc['crawler'].is_alive())):
+				if ('retry_timer_start_ts' in cc and (time.time() - cc['retry_timer_start_ts'] > 1800)):
 					# retry 30 mins after the crawler dies... mostly the crawler died because "Twitter API returned a 503 (Service Unavailable), Over capacity"
 					self.new_crawler(self.node_id, cc['apikeys'], self.config, cc['crawler_proxies'])
+					cc = self.crawlers[crawler_id]
+					logger.info('[%s] has been recreated...'%(crawler_id))
 				else:
-					cc['retry_timer_start_ts'] = int(time.time())
+					if ('retry_timer_start_ts' not in cc):
+						cc['retry_timer_start_ts'] = int(time.time())
+					else:
+						logger.warn('[%s] failed; waiting to recreate in %f mins...'%(crawler_id, (cc['retry_timer_start_ts'] + 1800 - time.time())/float(60)))
 			status.append({'crawler_id':crawler_id, 'alive?': cc['crawler'].is_alive(), 'qsize': cc['crawler_queue'].qsize(), 'crawler_queue_key': cc['crawler_queue'].get_key()})
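
The change above implements a simple deferred-retry pattern: the first status pass that finds a crawler dead records retry_timer_start_ts, and a later pass recreates the crawler only once 1800 seconds (30 minutes) have elapsed, since most crawler deaths come from Twitter's transient 503 "Over capacity" responses. Re-fetching cc after self.new_crawler(...) matters because new_crawler replaces self.crawlers[crawler_id]; the refreshed entry carries no retry timer, and the status dict built below then reports on the new crawler rather than the dead one.

For illustration, here is a minimal standalone sketch of the same timer logic; handle_dead_crawler, the recreate callable, and the now parameter are hypothetical names for this sketch, not part of tweetf0rm:

    import time

    RETRY_DELAY_SECS = 1800  # 30 minutes, the same constant the patch hard-codes

    def handle_dead_crawler(crawlers, crawler_id, recreate, now=None):
        # One status pass over a crawler already known to be dead.
        # crawlers: dict of per-crawler state dicts (plays the role of self.crawlers);
        # recreate: zero-arg callable that rebuilds crawlers[crawler_id]
        #           (stands in for self.new_crawler(...)).
        now = time.time() if now is None else now
        cc = crawlers[crawler_id]
        started = cc.get('retry_timer_start_ts')
        if started is not None and now - started > RETRY_DELAY_SECS:
            recreate()                 # replaces crawlers[crawler_id] with a fresh entry...
            cc = crawlers[crawler_id]  # ...so re-fetch it; the new entry has no timer set
        elif started is None:
            cc['retry_timer_start_ts'] = int(now)  # first failure seen: start the timer
        else:
            remaining = (started + RETRY_DELAY_SECS - now) / 60.0
            print('[%s] failed; waiting %.1f more mins before recreating' % (crawler_id, remaining))
        return cc

Taking now as a parameter keeps the timer arithmetic easy to unit-test without sleeping for half an hour.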