## Distributed Scraping: multiprocessing

In [9]:
import multiprocessing as mp
import re
import time
from urllib import request
from urllib.parse import urljoin

from bs4 import BeautifulSoup

In [8]:
# Crawl configuration shared by the cells below.

# Start page of the crawl; relative links found while parsing are resolved against it.
base_url = "https://artist.artron.net/class-0-0-0.html"

# When True, the crawl loop stops once more than 20 pages have been seen.
restricted_crawl = False

# HTTP headers sent with every request (a browser-like User-Agent).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
}

In [13]:
def crawl(url):
    """Download ``url`` and return the response body decoded as UTF-8 text.

    The request is sent with the module-level ``headers`` and is preceded by
    a 0.1 s pause to throttle successive downloads.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    req = request.Request(url=url, headers=headers)
    time.sleep(0.1)             # slightly delay for downloading
    # Close the response deterministically instead of leaking the socket
    # until garbage collection.
    with request.urlopen(req) as response:
        return response.read().decode('utf-8')

In [19]:
def parse(html):
    """Extract the page title and the outgoing links from an HTML document.

    Args:
        html: HTML source of a page as text.

    Returns:
        A ``(title, page_urls, url)`` tuple where ``title`` is the stripped
        text of the ``<title>`` element (``""`` when the page has none),
        ``page_urls`` is the set of absolute URLs built by joining each
        matching ``href`` with ``base_url``, and ``url`` is currently always
        the empty string.
    """
    soup = BeautifulSoup(html, 'lxml')
    # Keep only anchors whose href starts with "/" and ends with "/".
    links = soup.find_all('a', {"href": re.compile('^/.+?/$')})

    # A page without a <title> must not abort the whole crawl.
    title_tag = soup.find('title')
    title = title_tag.get_text().strip() if title_tag is not None else ""

    page_urls = {urljoin(base_url, link['href']) for link in links}

    # The page's own URL was once read from the "og:url" meta tag; that lookup
    # is disabled, so an empty placeholder keeps the return shape stable.
    url = ""
    return title, page_urls, url

### Normal way

In [20]:
# Sequential (single-process) breadth-first crawl starting from base_url.
unseen = set([base_url,])       # URLs discovered but not yet downloaded
seen = set()                    # URLs already downloaded

count, t1 = 1, time.time()

max_levels = 3                  # crawl at most three link levels
processing_level = 0

# Continue only while there is something left to visit AND the depth limit
# has not been reached.
while unseen and processing_level < max_levels:
    if restricted_crawl and len(seen) > 20:
        break
    processing_level += 1
    print("\nScraping the level:{}".format(processing_level))

    print('\nDistributed Crawling...')
    htmls = [crawl(url) for url in unseen]

    print('\nDistributed Parsing...')
    results = [parse(html) for html in htmls]

    print('\nAnalysing...')
    seen.update(unseen)         # seen the crawled
    unseen.clear()              # nothing unseen

    for title, page_urls, url in results:
        print(count, title, url)
        count += 1
        unseen.update(page_urls - seen)     # get new url to crawl
print('Total time: %.1f s' % (time.time()-t1, ))    # original note: 53 (presumably seconds from an earlier run)


Scraping the level:1

Distributed Crawling...

Distributed Parsing...

Analysing...
1 艺术家_大师-雅昌艺术家 

Scraping the level:2

Distributed Crawling...

Distributed Parsing...

Analysing...
2 艺术家资讯_雅昌艺术家 
3 李良东个人官网_个人作品推介、展示平台_雅昌艺术家网 
4 吴祖郊个人官网_个人作品推介、展示平台_雅昌艺术家网 
5 尚莹辉个人官网_个人作品推介、展示平台_雅昌艺术家网 
6 游新民个人官网_个人作品推介、展示平台_雅昌艺术家网 
7 刘小华个人官网_个人作品推介、展示平台_雅昌艺术家网 
8 薛丕显个人官网_个人作品推介、展示平台_雅昌艺术家网 
9 龙友个人官网_个人作品推介、展示平台_雅昌艺术家网 
10 杜仲个人官网_个人作品推介、展示平台_雅昌艺术家网 
11 陈流个人官网_个人作品推介、展示平台_雅昌艺术家网 
12 天堂元丰个人官网_个人作品推介、展示平台_雅昌艺术家网 
13 杨慧个人官网_个人作品推介、展示平台_雅昌艺术家网 
14 殷实个人官网_个人作品推介、展示平台_雅昌艺术家网 
15 张卉个人官网_个人作品推介、展示平台_雅昌艺术家网 
16 周刚个人官网_个人作品推介、展示平台_雅昌艺术家网 
17 梅一个人官网_个人作品推介、展示平台_雅昌艺术家网 
18 艺术家价值_雅昌艺术家 
19 顾玉红个人官网_个人作品推介、展示平台_雅昌艺术家网 

Scraping the level:3

Distributed Crawling...

Distributed Parsing...

Analysing...
20 李良东_个人作品推介、展示平台_雅昌艺术家网 
21 尚莹辉_个人作品推介、展示平台_雅昌艺术家网 
22 周刚市场行情_周刚官网_周刚作品推介_个人作品推介平台_雅昌艺术家网 
23 李良东相册_李良东官网_李良东作品推介_个人作品推介平台_雅昌艺术家网 
24 游新民市场行情_游新民官网_游新民作品推介_个人作品推介平台_雅昌艺术家网 
25 尚莹辉_个人作品推介、展示平台_雅昌艺术家网 
26 张卉资讯_张卉个人官网

KeyboardInterrupt: 