In [3]:
import re
import time
import requests
import tldextract
from bs4 import BeautifulSoup
import cchardet
import traceback
import leveldb

In [4]:
import pickle
import urllib.parse as urlparse

In [13]:
def save_to_db(url, html):
    """Placeholder persistence hook that only reports what would be stored.

    Nothing is written anywhere: the function prints a single line made of
    the URL and the length of the downloaded page.

    Args:
        url: Address the page was fetched from.
        html: Page content; only its length is used.
    """
    page_size = len(html)
    report = '{url} : {num}'.format(num=page_size, url=url)
    print(report)

In [16]:
def crawl():
    """Fetch the Baidu news hub page and download every external link on it.

    Steps:
      1. Download the hub page and pull every ``href`` value out of the raw
         HTML with a regular expression.
      2. Keep only absolute ``http``/``https`` links whose registered domain
         is not ``baidu``.
      3. Download each remaining link and hand the page to ``save_to_db``.

    Every request uses a timeout so a stalled server cannot block the crawl
    forever, and a link that fails to download is reported and skipped
    instead of aborting the remaining links.

    Raises:
        requests.RequestException: if the hub page itself cannot be fetched.
    """
    url = 'http://news.baidu.com/'
    request_timeout = 10  # seconds, applied to every request below
    html = requests.get(url, timeout=request_timeout).text

    links = re.findall(r'href=[\'"]?(.*?)[\'"\s]', html)
    print('find links:', len(links))

    news_links = []
    for link in links:
        # Relative links, javascript: pseudo-links, anchors, etc. are ignored.
        if not link.startswith('http'):
            continue
        # Links back into baidu itself are navigation, not news articles.
        if tldextract.extract(link).domain == 'baidu':
            continue
        news_links.append(link)
    print('find news links:', len(news_links))

    for link in news_links:
        try:
            page = requests.get(link, timeout=request_timeout).text
        except requests.RequestException as exc:
            # One unreachable site must not stop the rest of the batch.
            print('failed download: {}'.format(link), exc)
            continue
        save_to_db(link, page)
    print('works done!')

In [19]:
def main():
    """Run ``crawl`` in an endless loop, sleeping 10 seconds after each round.

    The function never returns on its own; it stops only when ``crawl`` (or
    the sleep) raises.
    """
    pause_seconds = 10
    while True:
        crawl()
        time.sleep(pause_seconds)

In [32]:
def downloader(url, timeout=10, headers=None, debug=False, binary=False):
    """Download ``url`` and return ``(status, html, redirected_url)``.

    Args:
        url: Address to fetch.
        timeout: Timeout in seconds passed to ``requests.get``.
        headers: Optional request headers; when given (and non-empty) they
            replace the default browser-like ``User-Agent`` header entirely.
        debug: When true, print the full traceback of a failed download.
        binary: When true, return the raw response bytes; otherwise decode
            the body using the character set detected by ``cchardet``.

    Returns:
        A tuple ``(status, html, redirected_url)``:
          * ``status`` - HTTP status code, or ``0`` if the download failed.
          * ``html`` - ``bytes`` when ``binary`` is true, else ``str``; empty
            on failure.
          * ``redirected_url`` - ``r.url`` of the response, or the original
            ``url`` on failure.

    Failures (network errors, undecodable bodies, ...) never raise; they are
    reported on stdout and signalled through ``status == 0``.
    """
    _headers = {
        "User-Agent":'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
    }
    if headers:
        _headers = headers
    redirected_url = url
    try:
        r = requests.get(url, headers=_headers, timeout=timeout)
        if binary:
            html = r.content
        else:
            # The detector reports its guess under the 'encoding' key. The
            # guess can be None when nothing could be detected, so fall back
            # to UTF-8 rather than passing None to decode().
            encoding = cchardet.detect(r.content)['encoding'] or 'utf-8'
            html = r.content.decode(encoding)
        status = r.status_code
        redirected_url = r.url
    except Exception:
        # "except Exception" (not a bare except) keeps this best-effort
        # while still letting KeyboardInterrupt / SystemExit stop the crawler.
        if debug:
            traceback.print_exc()
        msg = 'failed download: {}'.format(url)
        print(msg)
        html = b'' if binary else ''
        status = 0
    return status, html, redirected_url

In [33]:
# File extensions (lower case, without the dot) treated as binary content.
g_bin_postfix = {
    # executables and office documents
    'exe', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx', 'pdf',
    # images
    'jpg', 'png', 'bmp', 'jpeg', 'gif',
    # archives
    'zip', 'rar', 'tar', 'bz2', '7z', 'gz',
    # video
    'flv', 'mp4', 'avi', 'wmv', 'mkv',
    # Android packages
    'apk',
}

# Substrings marking a static page URL that is followed by a query string.
g_news_postfix = ['.html?', '.htm?', '.shtml?', '.shtm?']


def clean_url(url):
    """Normalise a URL for crawling, or return '' if it should be skipped.

    Rules, applied in order:
      1. Anything not starting with 'http' is rejected ('' is returned).
      2. If the URL contains one of the ``g_news_postfix`` markers, it is
         cut at its first '?' and returned.
      3. URLs whose path ends in an extension listed in ``g_bin_postfix``
         are rejected.
      4. Query parameters whose name starts with 'spm' or 'utm_', and
         parameters without an '=', are dropped; the fragment is discarded.

    Args:
        url: The URL string to clean.

    Returns:
        The cleaned URL, or '' when the URL is rejected.
    """
    # 1. Must look like an http(s) URL.
    if not url.startswith('http'):
        return ''

    # 2. Static pages: strip the query string that follows the page name.
    if any(marker in url for marker in g_news_postfix):
        return url[:url.find('?')]

    # 3. Skip links that point at binary content.
    parts = urlparse.urlparse(url)
    path = parts.path or '/'
    extension = path.rsplit('.', 1)[-1].lower()
    if extension in g_bin_postfix:
        return ''

    # 4. Drop parameters that only identify the traffic source.
    kept_pairs = []
    for pair in parts.query.split('&'):
        key, separator, _value = pair.partition('=')
        if not separator or key.startswith(('spm', 'utm_')):
            continue
        kept_pairs.append(pair)

    # The crawler does not care about the fragment, so it is left empty.
    return urlparse.urlunparse((
        parts.scheme,
        parts.netloc,
        path,
        parts.params,
        '&'.join(kept_pairs),
        '',
    ))

In [37]:
class UrlDB(object):
    """Persistent record of which URLs have been finished, backed by LevelDB.

    Each URL (stored as UTF-8 bytes) maps to ``status_success`` or
    ``status_failure``.
    """

    status_failure = b'0'
    status_success = b'1'

    def __init__(self, db_name):
        """Open the LevelDB database named ``db_name + '.urldb'``."""
        self.name = db_name + '.urldb'
        self.db = leveldb.LevelDB(self.name)

    @staticmethod
    def _to_key(url):
        """Return ``url`` as a database key: ``str`` is encoded to UTF-8."""
        if isinstance(url, str):
            url = url.encode('utf8')
        return url

    def _put(self, url, status):
        """Store ``status`` for ``url``; return True on success, else False."""
        key = self._to_key(url)
        try:
            self.db.Put(key, status)
        except Exception:
            # Best effort: a failed write is reported through the return
            # value. Unlike a bare except, KeyboardInterrupt/SystemExit
            # still propagate.
            return False
        return True

    def set_success(self, url):
        """Mark ``url`` as successfully downloaded. Returns True if stored."""
        return self._put(url, self.status_success)

    def set_failure(self, url):
        """Mark ``url`` as permanently failed. Returns True if stored."""
        return self._put(url, self.status_failure)

    def has(self, url):
        """Return the stored status for ``url``, or False if the lookup fails.

        Any error raised by the database lookup (including a missing key)
        is reported as False.
        """
        key = self._to_key(url)
        try:
            return self.db.Get(key)
        except Exception:
            return False

In [1]:
class UrlPool(object):
    """In-memory pool of URLs to crawl, grouped by host.

    URLs wait in ``self.waiting`` (host -> set of URLs) until ``pop`` hands
    them out; popped URLs are tracked in ``self.pending`` until
    ``set_status`` reports the result. Finished URLs are recorded in a
    ``UrlDB`` so they are not queued again. "Hub" URLs registered through
    ``set_hubs`` are handed out again every ``hub_refresh_span`` seconds.
    """

    def __init__(self, pool_name):
        """Create the pool and reload any waiting URLs saved by a previous run.

        Args:
            pool_name: Base name used for the UrlDB and for the pickle file
                (``pool_name + '.pkl'``) that stores the waiting URLs.
        """
        self.name = pool_name
        self.db = UrlDB(pool_name)
        self.waiting = {}            # host -> set of URLs not yet handed out
        self.pending = {}            # url -> time.time() when it was popped
        self.failure = {}            # url -> number of failed downloads so far
        self.failure_threshold = 3   # failures tolerated before giving up
        self.pending_threshold = 10  # seconds a popped URL blocks re-adding
        self.waiting_count = 0       # total number of URLs in self.waiting
        self.max_hosts = ['', 0]     # [host, count] of the fullest host seen
        self.hub_pool = {}           # hub url -> time.time() of last pop
        self.hub_refresh_span = 0    # seconds between two pops of one hub
        self.load_cache()

    def __del__(self):
        """Save the waiting URLs when the pool is garbage collected."""
        self.dump_cache()

    def load_cache(self):
        """Restore ``self.waiting`` from ``<name>.pkl`` if it can be read.

        A missing or unreadable cache is silently ignored. After a
        successful load, ``waiting_count`` and ``max_hosts`` are refreshed
        so ``size()`` and ``empty()`` reflect the restored URLs.
        """
        path = self.name + '.pkl'
        try:
            # NOTE(security): unpickling can execute arbitrary code; only
            # load cache files written by this program.
            with open(path, 'rb') as f:
                cached = pickle.load(f)
            if not isinstance(cached, dict):
                raise TypeError('unexpected cache content')
            total = sum(len(urls) for urls in cached.values())
        except Exception:
            # Best effort: start with an empty pool when there is no usable
            # cache.
            return
        self.waiting = cached
        self.waiting_count = total
        for host, urls in cached.items():
            if len(urls) > self.max_hosts[1]:
                self.max_hosts[0] = host
                self.max_hosts[1] = len(urls)
        print('saved pool loaded! urls', total)

    def dump_cache(self):
        """Write ``self.waiting`` to ``<name>.pkl``; failures are ignored."""
        try:
            path = self.name + '.pkl'
            with open(path, 'wb') as f:
                pickle.dump(self.waiting, f)
            print('self.waiting saved!')
        except Exception:
            # This runs from __del__, possibly on a half-built object or
            # during interpreter shutdown, so it must never raise.
            pass

    def set_hubs(self, urls, hub_refresh_span):
        """Replace the hub list; every hub is immediately due for a pop.

        Args:
            urls: Iterable of hub URLs.
            hub_refresh_span: Seconds to wait before a hub is popped again.
        """
        self.hub_refresh_span = hub_refresh_span
        self.hub_pool = {}
        for url in urls:
            self.hub_pool[url] = 0

    def set_status(self, url, status_code):
        """Record the download result of ``url``.

        200 marks the URL as done and 404 as permanently failed. Any other
        status counts as a failure: the URL is queued again until it has
        failed more than ``failure_threshold`` times, after which it is
        marked as failed.
        """
        self.pending.pop(url, None)

        if status_code == 200:
            self.failure.pop(url, None)  # drop the stale retry counter
            self.db.set_success(url)
            return
        if status_code == 404:
            self.failure.pop(url, None)
            self.db.set_failure(url)
            return

        if url in self.failure:
            self.failure[url] += 1
            if self.failure[url] > self.failure_threshold:
                self.db.set_failure(url)
                self.failure.pop(url)
            else:
                self.add(url)
        else:
            self.failure[url] = 1
            self.add(url)

    def push_to_pool(self, url):
        """Put ``url`` into the waiting set of its host.

        Returns:
            False when the URL has no usable host, True otherwise (also when
            the URL was already waiting).
        """
        host = urlparse.urlparse(url).netloc
        if not host or '.' not in host:
            print('try to push_to_pool with bad url:', url, ', len of ur:', len(url))
            return False
        bucket = self.waiting.setdefault(host, set())
        if url in bucket:
            return True
        bucket.add(url)
        # Track the fullest host, including the first URL of a new host.
        if len(bucket) > self.max_hosts[1]:
            self.max_hosts[1] = len(bucket)
            self.max_hosts[0] = host
        self.waiting_count += 1
        return True

    def add(self, url, always=False):
        """Queue ``url`` unless it is being downloaded or already finished.

        Args:
            url: URL to queue.
            always: When true, skip all checks and queue unconditionally.

        Returns:
            The result of ``push_to_pool`` when the URL was pushed, otherwise
            None.
        """
        if always:
            return self.push_to_pool(url)
        pended_time = self.pending.get(url, 0)
        if time.time() - pended_time < self.pending_threshold:
            print('being downloading:', url)
            return
        if self.db.has(url):
            return
        if pended_time:
            # The pending entry is older than the threshold; forget it and
            # queue the URL again.
            self.pending.pop(url)
        return self.push_to_pool(url)

    def addmany(self, urls, always=False):
        """Queue several URLs; a single string is treated as one URL."""
        if isinstance(urls, str):
            print('urls is a str !!!!', urls)
            self.add(urls, always)
        else:
            for url in urls:
                self.add(url, always)

    def pop(self, count, hub_percent=50):
        """Hand out at most ``count`` URLs to download.

        Hubs that are due for a refresh are taken first, up to
        ``hub_percent`` percent of ``count`` (rounded down, but at least one
        when ``count`` and ``hub_percent`` are positive, so small batches
        still refresh hubs). The remainder is filled with waiting URLs, at
        most one per host per call.

        Args:
            count: Maximum number of URLs to return.
            hub_percent: Share of ``count`` reserved for hub URLs.

        Returns:
            Dict mapping url -> 1 for hub URLs and url -> 0 for ordinary
            URLs.
        """
        print('\n\tmax of host:', self.max_hosts)

        # A popped URL is one of two kinds: hub = 1, ordinary = 0.
        url_attr_url = 0
        url_attr_hub = 1

        # 1. Take hubs first so the newest links on hub pages are found.
        hubs = {}
        hub_count = min(count, count * hub_percent // 100)
        if hub_count < 1 and count > 0 and hub_percent > 0:
            hub_count = 1
        if hub_count > 0:
            for hub in self.hub_pool:
                span = time.time() - self.hub_pool[hub]
                if span < self.hub_refresh_span:
                    continue
                hubs[hub] = url_attr_hub
                self.hub_pool[hub] = time.time()
                if len(hubs) >= hub_count:
                    break

        # 2. Fill what is left with ordinary URLs. The guard matters: without
        #    it a zero quota would still pop one URL.
        left_count = count - len(hubs)
        urls = {}
        if left_count > 0:
            for host in self.waiting:
                if not self.waiting[host]:
                    continue
                url = self.waiting[host].pop()
                urls[url] = url_attr_url
                self.pending[url] = time.time()
                if self.max_hosts[0] == host:
                    self.max_hosts[1] -= 1
                if len(urls) >= left_count:
                    break
        self.waiting_count -= len(urls)
        print('To pop:%s, hubs: %s, urls: %s, hosts:%s' % (count, len(hubs), len(urls), len(self.waiting)))
        urls.update(hubs)
        return urls

    def size(self,):
        """Return the number of URLs currently waiting."""
        return self.waiting_count

    def empty(self,):
        """Return True when no URL is waiting."""
        return self.waiting_count == 0