Permalink
Cannot retrieve contributors at this time
Fetching contributors…
| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| import logging | |
| logger = logging.getLogger(__name__) | |
| # logging.basicConfig(level=logging.DEBUG, format='%(levelname)s: %(message)s') | |
| # requests_log = logging.getLogger("requests") | |
| # requests_log.setLevel(logging.WARNING) | |
| import json, copy, time | |
| from tweetf0rm.utils import full_stack, hash_cmd, md5, get_keys_by_min_value | |
| from tweetf0rm.proxies import proxy_checker | |
| from process.user_relationship_crawler import UserRelationshipCrawler | |
| #from handler.inmemory_handler import InMemoryHandler | |
| from handler import create_handler | |
| from tweetf0rm.redis_helper import NodeCoordinator, NodeQueue | |
| import twython, pprint | |
class Scheduler(object):
    """Manage a pool of crawler processes (one per Twitter API key) and
    distribute crawl commands among them and, on failure, among other
    nodes in the cluster via their redis queues.
    """

    def __init__(self, node_id, config=None, proxies=None):
        """Create one crawler process per available apikey (bounded by the
        number of live proxies, when proxies are used) and register this
        node with the cluster's NodeCoordinator.

        node_id: identifier of this node.
        config: dict expected to contain 'apikeys', 'redis_config' and
                'output' (defaults to an empty dict).
        proxies: optional proxy list; each crawler gets an even chunk of
                 the live ones so it can rotate when a proxy fails.
        """
        # The original signature used mutable defaults (config={}, proxies=[]),
        # which are shared across calls; keep the parameters optional but bind
        # fresh objects here instead.
        self.node_id = node_id
        self.config = {} if config is None else config
        proxies = [] if proxies is None else proxies

        if len(proxies) > 0:
            self.proxy_list = proxy_checker(proxies)
            logger.info("number of live proxies: %d" % (len(self.proxy_list)))
            # Each process only gets one apikey; if there are more proxies than
            # apikeys, each process gets a list of proxies and restarts itself
            # with the next one when the current proxy fails.
            number_of_processes = min(len(self.config['apikeys']), len(self.proxy_list))
            self.proxy_generator = self.split(self.proxy_list, number_of_processes)
        else:
            self.proxy_list = None
            self.proxy_generator = None
            number_of_processes = 1

        logger.info("number of crawlers: %d" % (number_of_processes))

        # list() is required on Python 3, where dict.keys() returns a view
        # that cannot be indexed.
        apikey_list = list(self.config['apikeys'].keys())
        self.crawlers = {}
        for idx in range(number_of_processes):
            try:
                self.new_crawler(self.config['apikeys'][apikey_list[idx]], self.config)
            except Exception:
                # Best-effort: one failed crawler must not prevent the others
                # from starting, but the failure should not be silent either
                # (the original did `except: pass`).
                logger.error(full_stack())

        self.node_coordinator = NodeCoordinator(self.config['redis_config'])
        self.node_coordinator.add_node(node_id)
        logger.info("number of crawlers: %d created" % (number_of_processes))

    def new_crawler(self, apikeys, config, crawler_proxies=None):
        """Start one UserRelationshipCrawler process and register it in
        self.crawlers.

        apikeys: credential dict for this crawler; its 'app_key' doubles
                 as the crawler_id.
        config: scheduler config ('output' and 'redis_config' are used).
        crawler_proxies: proxies to assign; when falsy, the next chunk from
                 self.proxy_generator is used (if there is one).

        TwythonAuthError (bad credentials) is logged and swallowed; any
        other exception propagates to the caller.
        """
        file_handler_config = {
            "name": "FileHandler",
            "args": {
                "output_folder": config["output"]
            }
        }
        try:
            crawler_id = apikeys['app_key']
            logger.debug('creating a new crawler: %s' % crawler_id)
            if not crawler_proxies:
                crawler_proxies = next(self.proxy_generator) if self.proxy_generator else None
            crawler = UserRelationshipCrawler(
                self.node_id, crawler_id, copy.copy(apikeys),
                handlers=[create_handler(file_handler_config)],
                redis_config=copy.copy(config['redis_config']),
                proxies=crawler_proxies)
            self.crawlers[crawler_id] = {
                'apikeys': apikeys,
                'crawler': crawler,
                'queue': {},
                'crawler_proxies': crawler_proxies
            }
            crawler.start()
        except twython.exceptions.TwythonAuthError as exc:
            # Bad credentials: record which apikeys failed and keep going.
            logger.error('%s: %s' % (exc, apikeys))

    def is_alive(self):
        """Return True while at least one crawler process is still running."""
        return any(cc['crawler'].is_alive() for cc in self.crawlers.values())

    def crawler_status(self):
        """Restart any dead crawler (reusing its apikeys/proxies) and return
        a list of {crawler_id: alive?, 'qsize': pending} status dicts.

        NOTE(review): the status entry is built from the *old* registry entry,
        so a crawler restarted in this pass still reports is_alive() == False
        until the next call — preserved from the original behavior.
        """
        status = []
        for crawler_id in self.crawlers:
            cc = self.crawlers[crawler_id]
            if not cc['crawler'].is_alive():
                self.new_crawler(cc['apikeys'], self.config, cc['crawler_proxies'])
            status.append({crawler_id: cc['crawler'].is_alive(), 'qsize': len(cc['queue'])})
        return status

    def distribute_to(self):
        """Return the crawler_id with the smallest pending queue (ties go to
        the last one seen in iteration order), or None if there are no
        crawlers."""
        best_qsize = None
        best_crawler_id = None
        for crawler_id in self.crawlers:
            qsize = len(self.crawlers[crawler_id]['queue'])
            # `is None` instead of the original `== None`
            if best_qsize is None or best_qsize >= qsize:
                best_qsize = qsize
                best_crawler_id = crawler_id
        return best_crawler_id

    def persist_queues(self):
        """Dump every pending cmd of every crawler into a timestamped JSON
        file in the working directory (used before TERMINATE)."""
        cmds = {}
        for cc in self.crawlers.values():
            cmds.update(cc['queue'])
        # Text mode: json.dump writes str, not bytes — the original opened
        # the file as 'wb', which raises TypeError on Python 3.
        with open('%s_queued_cmds.json' % (int(time.time())), 'w') as f:
            json.dump(cmds, f)

    def remaining_tasks(self):
        """Return the total number of cmds still queued across all crawlers."""
        return sum(len(cc['queue']) for cc in self.crawlers.values())

    def distribute_to_nodes(self, queue):
        """Redistribute the cmds in `queue` to the least-loaded nodes in the
        cluster, updating the local view of node queue sizes as we go."""
        node_queues = {}

        def get_node_queue(node_id, redis_config):
            # Cache one NodeQueue connection per node. The original helper
            # never returned the queue and was called with the wrong
            # arguments (self, node_id) — both fixed here.
            if node_id not in node_queues:
                node_queues[node_id] = NodeQueue(node_id, redis_config=redis_config)
            return node_queues[node_id]

        qsizes = self.node_coordinator.node_qsizes()
        for cmd in queue.values():
            node_id = get_keys_by_min_value(qsizes)[0]
            node_queue = get_node_queue(node_id, self.config['redis_config'])
            node_queue.put(cmd)
            qsizes[node_id] += 1

    def enqueue(self, cmd):
        """Dispatch a control cmd ('TERMINATE', 'CRAWLER_FLUSH',
        'CRAWLER_FAILED', 'CMD_FINISHED') or route a regular crawl cmd to
        the least-loaded local crawler."""
        if cmd['cmd'] == 'TERMINATE':
            # Persist local queues before dying; other nodes may still push
            # into this node's redis queue after death — that is handled by a
            # maintenance job that drains dead nodes' queues.
            # (The original compared the *unbound name* `remaining_tasks`
            # to 0 — a NameError; it must call the method.)
            if self.remaining_tasks() > 0:
                self.persist_queues()
            for cc in self.crawlers.values():
                cc['crawler'].enqueue(cmd)
        elif cmd['cmd'] == 'CRAWLER_FLUSH':
            for cc in self.crawlers.values():
                cc['crawler'].enqueue(cmd)
        elif cmd['cmd'] == 'CRAWLER_FAILED':
            crawler_id = cmd['crawler_id']
            if crawler_id in self.crawlers:
                logger.warning('%s just failed... redistributing its workload' % (crawler_id))
                try:
                    self.distribute_to_nodes(self.crawlers[crawler_id]['queue'])
                    # Wait until the failed crawler dies (flushed all its data).
                    while self.crawlers[crawler_id]['crawler'].is_alive():
                        time.sleep(60)
                except Exception:
                    logger.error(full_stack())
            else:
                logger.warning("whatever are you trying to do? crawler_id: [%s] is not valid..." % (crawler_id))
        elif cmd['cmd'] == 'CMD_FINISHED':
            # Acknowledge a finished cmd: drop it from the crawler's queue.
            try:
                crawler_id = cmd['crawler_id']
                del self.crawlers[crawler_id]['queue'][cmd['cmd_hash']]
                logger.info('removed cmd: %s from [%s]' % (cmd['cmd_hash'], crawler_id))
            except Exception as exc:
                logger.warning("the cmd doesn't exist? %s: %s" % (cmd['cmd_hash'], exc))
        else:
            crawler_id = self.distribute_to()
            cmd_hash = hash_cmd(cmd)
            cmd['cmd_hash'] = cmd_hash
            self.crawlers[crawler_id]['queue'][cmd_hash] = cmd
            self.crawlers[crawler_id]['crawler'].enqueue(cmd)
            logger.debug("pushed %s: [%s] to crawler: %s" % (cmd, cmd_hash, crawler_id))

    def check_local_qsizes(self):
        """Return {crawler_id: pending queue size} for every local crawler."""
        return {crawler_id: len(self.crawlers[crawler_id]['queue']) for crawler_id in self.crawlers}

    def split(self, lst, n):
        """Yield n even-sized sub-lists of lst, assigning each element to
        the currently smallest chunk."""
        lsize = {}
        results = {}
        for i in range(n):
            lsize[i] = 0
            results[i] = []
        for x in lst:
            idx = get_keys_by_min_value(lsize)[0]
            results[idx].append(x)
            lsize[idx] += 1
        for i in range(n):
            yield results[i]