Permalink
Browse files

handle dead crawler gracefully: clean up residual resources, and schedule it to restart after 30 mins
  • Loading branch information...
1 parent 53286d2 commit 84e96d127c23ddab183c5a282e061c410287d5f7 @bianjiang committed Nov 8, 2013
Showing with 2 additions and 1 deletion.
  1. +2 −1 tweetf0rm/scheduler.py
View
@@ -100,7 +100,7 @@ def crawler_status(self):
status = []
for crawler_id in self.crawlers:
cc = self.crawlers[crawler_id]
- if (not cc['crawler'].is_alive()):
+ if ((not cc['crawler'].is_alive()) and time.time() - cc['retry_timer_start_ts'] > 1800): # retry 30 mins after the crawler dies... mostly the crawler died because "Twitter API returned a 503 (Service Unavailable), Over capacity"
self.new_crawler(cc['apikeys'], self.config, cc['crawler_proxies'])
status.append({crawler_id: cc['crawler'].is_alive(), 'qsize': len(cc['queue'])})
@@ -157,6 +157,7 @@ def enqueue(self, cmd):
time.sleep(60)
wait_timer -= 60
+ self.crawlers[crawler_id]['retry_timer_start_ts'] = int(time.time())
except Exception as exc:
logger.error(full_stack())
else:

0 comments on commit 84e96d1

Please sign in to comment.