From 84e96d127c23ddab183c5a282e061c410287d5f7 Mon Sep 17 00:00:00 2001 From: Jiang Bian Date: Fri, 8 Nov 2013 01:01:17 -0600 Subject: [PATCH] handle dead crawler gracefully: clean up residual resources and schedule it to restart after 30 minutes --- tweetf0rm/scheduler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tweetf0rm/scheduler.py b/tweetf0rm/scheduler.py index 2ba995c..05becd3 100644 --- a/tweetf0rm/scheduler.py +++ b/tweetf0rm/scheduler.py @@ -100,7 +100,7 @@ def crawler_status(self): status = [] for crawler_id in self.crawlers: cc = self.crawlers[crawler_id] - if (not cc['crawler'].is_alive()): + if ((not cc['crawler'].is_alive()) and time.time() - cc['retry_timer_start_ts'] > 1800): # retry 30 mins after the crawler dies... mostly the crawler died because "Twitter API returned a 503 (Service Unavailable), Over capacity" self.new_crawler(cc['apikeys'], self.config, cc['crawler_proxies']) status.append({crawler_id: cc['crawler'].is_alive(), 'qsize': len(cc['queue'])}) @@ -157,6 +157,7 @@ def enqueue(self, cmd): time.sleep(60) wait_timer -= 60 + self.crawlers[crawler_id]['retry_timer_start_ts'] = int(time.time()) except Exception as exc: logger.error(full_stack()) else: