hubproxy.py (forked from scrapinghub/scrapylib)
from w3lib.http import basic_auth_header

from scrapy import log, signals


class HubProxyMiddleware(object):

    url = 'http://proxy.scrapinghub.com:8010'
    maxbans = 20
    ban_code = 503
    download_timeout = 1800

    @classmethod
    def from_crawler(cls, crawler):
        o = cls()
        o.crawler = crawler
        # Connect through the crawler's signal manager rather than the
        # deprecated scrapy.xlib.pydispatch dispatcher.
        crawler.signals.connect(o.open_spider, signals.spider_opened)
        return o

    def open_spider(self, spider):
        self.enabled = self.is_enabled(spider)
        if not self.enabled:
            return
        # Resolve each option with increasing precedence:
        # class default -> HUBPROXY_* setting -> hubproxy_* spider attribute.
        for k in ('user', 'pass', 'url', 'maxbans', 'download_timeout'):
            o = getattr(self, k, None)
            s = self.crawler.settings.get('HUBPROXY_' + k.upper(), o)
            v = getattr(spider, 'hubproxy_' + k, s)
            setattr(self, k, v)
        self._bans = 0
        self._proxyauth = self.get_proxyauth(spider)
        log.msg("Using hubproxy at %s (user: %s)" % (self.url, self.user),
                spider=spider)

    def is_enabled(self, spider):
        """Hook to enable middleware by custom rules"""
        return (getattr(spider, 'use_hubproxy', False)
                or self.crawler.settings.getbool("HUBPROXY_ENABLED"))

    def get_proxyauth(self, spider):
        """Hook to compute Proxy-Authorization header by custom rules"""
        # 'pass' is a reserved word, so the attribute is read via getattr().
        return basic_auth_header(self.user, getattr(self, 'pass'))

    def process_request(self, request, spider):
        if self.enabled:
            request.meta['proxy'] = self.url
            request.meta['download_timeout'] = self.download_timeout
            request.headers['Proxy-Authorization'] = self._proxyauth

    def process_response(self, request, response, spider):
        # Count consecutive ban responses and close the spider once the
        # threshold is exceeded; any non-ban response resets the counter.
        if response.status == self.ban_code:
            self._bans += 1
            if self._bans > self.maxbans:
                self.crawler.engine.close_spider(spider, 'banned')
        else:
            self._bans = 0
        return response
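
Usage sketch (not part of the original file): one way to wire the middleware into a Scrapy project's settings.py. The module path "myproject.hubproxy" and the middleware priority are assumed placeholders; the HUBPROXY_* setting names mirror the options resolved in open_spider().

# settings.py (sketch, assuming the file lives at myproject/hubproxy.py)
DOWNLOADER_MIDDLEWARES = {
    'myproject.hubproxy.HubProxyMiddleware': 610,  # assumed priority
}
HUBPROXY_ENABLED = True
HUBPROXY_USER = 'myuser'    # proxy credentials (placeholders)
HUBPROXY_PASS = 'secret'
# Optional overrides; defaults are the class attributes above:
# HUBPROXY_URL = 'http://proxy.scrapinghub.com:8010'
# HUBPROXY_MAXBANS = 20
# HUBPROXY_DOWNLOAD_TIMEOUT = 1800

Alternatively, per is_enabled() and the attribute lookup in open_spider(), a single spider can opt in by defining use_hubproxy = True plus hubproxy_user and hubproxy_pass as spider attributes.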