### 百度百科爬取

In [181]:
#url管理器
#coding:utf-8
class UrlManager(object):
    def __init__(self):
        self.new_urls=set()
        self.old_urls=set()
    def has_new_url(self):
        return self.new_url_size()!=0
    def get_new_url(self):
        new_url=self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
    def add_new_url(self,url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)
            
    def add_new_urls(self,urls):
        if urls is None or len(urls)==0:
            return
        for url in urls:
            self.add_new_url(url)
    def new_url_size(self):
        return len(self.new_urls)
    def old_url_size(self):
        return len(self.old_urls)
    #is ok

In [182]:
#html下载器
#coding:utf-8
import requests
class HtmlDownloader(object):
    def download(self,url):
        if url is None:
            return None
        user_agent='Mozilla/4.0(compatible;MSIE 5.5;Windows NT)'
        headers={'User-Agent':user_agent}
        r=requests.get(url,headers=headers)
        if r.status_code==200:
            r.encoding='utf-8'
            return r.text
        return None
            #is ok

In [183]:
#html解析器
#coding:utf-8
import re 
import urllib.parse as pa
from bs4 import BeautifulSoup
class HtmlParser(object):
    def parser2(self,page_url,html_cont):
        '''
        :param page_url: 下载页面的url
        :param html_cont: 下载页面的网页内容
        :return:返回url和数据
        '''
        if page_url is None or html_cont is None:
            return
        soup=BeautifulSoup(html_cont,'html.parser',from_encoding='UTF-8')
#         print(soup)
        new_urls=self._get_new_urls(page_url,soup)
        new_data=self._get_new_data(page_url,soup)
        return new_urls,new_data
    
    def _get_new_urls(self,page_url,soup):
        '''
        抽取新的url集合
        :param page_url: 下载页面的url
        :param soup;soup
        :return:返回新的URL集合
        '''
        new_urls=set()
        #抽取符合要求的a标记
        links=soup.find_all('a',href=re.compile(r'\/item\/[0-9A-Za-z%]{1,100}\/[0-9]{0,100}'))
#         return links
#         print(links)
        for link in links:
            #提前href属性
            new_url=link['href']
#             print(links)
            #拼接成完整网站
#             print(page_url,'****',new_url)
            page_url='https://baike.baidu.com'
            new_full_url=pa.urljoin(page_url,new_url)
#             print(new_full_url)
            new_urls.add(new_full_url)
        return new_urls
    def _get_new_data (self,page_url,soup):
        '''
        抽取有效数据
        :param page_url: 下载页面的url
        :param soup:
        :return:返回有效数据
        '''
        data={}
        data['url']=page_url
        title=soup.find('dd',class_='lemmaWgt-lemmaTitle-title').find('h1')
        data['title']=title.get_text()
        summary=soup.find('div',class_='lemma-summary')
        data['summary']=summary.get_text()
        
        return data
    #?

In [190]:
#数据存储器
#coding:utf-8
import codecs
class DataOutput(object):
    def __init__(self):
        self.datas=[]
    def store_data(self,data):
        if data is None:
            return
        m={}
        for k in data:
            p=data[k].replace(u'\xa0', u' ')
            m.update({k:p})
        self.datas.append(m)
#         print(data)
        
    def output_html(self):
        fout=codecs.open('baike.html','w',encoding='gbk')
        fout.write('<html>')
        fout.write('<body>')
        fout.write('<table>')
        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>'%data['url'])
            fout.write('<td>%s</td>'%data['title'])
            fout.write('<td>%s</td>'%data['summary'])
            fout.write('<tr>')
            self.datas.remove(data)
        fout.write('<table>')
        fout.write('<body>')
        fout.write('<html>')
        fout.close()     
        #is ok

In [191]:
class SpiderMan(object):
    def __init__(self):
        self.manager=UrlManager()
        self.downloader=HtmlDownloader()
        self.parser=HtmlParser()
        self.output=DataOutput()
    def crawl(self,root_url):
        #添加入口url
        self.manager.add_new_url(root_url)
        #判断url管理器 中是否有新的url，同时判断抓取了多少个url
        while (self.manager.has_new_url() and self.manager.old_url_size()<100):
#             try:
                #获取新的url
            new_url=self.manager.get_new_url()
                #下载网页
            html=self.downloader.download(new_url)
            print(new_url)
#             print(html)
                #抽取网页数据
            new_urls,data=self.parser.parser2(new_url,html)
#                 print(new_urls)
#                 print(data)
#             print(data)
                #添加到url管理器
            self.manager.add_new_urls(new_urls)
                #存储文件
            self.output.store_data(data)
            print('已经抓起%s个连接'%self.manager.old_url_size())
#             except Exception:
#                 print('crawl fail')
        self.output.output_html()

        
if __name__=='__main__':
    spider_man=SpiderMan()
    spider_man.crawl('https://baike.baidu.com/view/284853.htm')

https://baike.baidu.com/view/284853.htm
已经抓起1个连接




https://baike.baidu.com/item/%E8%A0%95%E8%99%AB/4454380
已经抓起2个连接
https://baike.baidu.com/item/%E4%B8%87%E7%BB%B4%E7%BD%91/215515
已经抓起3个连接
https://baike.baidu.com/item/%E8%A0%95%E8%99%AB/21038#viewPageContent
已经抓起4个连接
https://baike.baidu.com/item/www/12031376#viewPageContent
已经抓起5个连接
https://baike.baidu.com/item/www/20192985#viewPageContent
已经抓起6个连接
https://baike.baidu.com/item/%E5%85%89/18674514
已经抓起7个连接
https://baike.baidu.com/item/%E5%85%89/22230849#viewPageContent
已经抓起8个连接
https://baike.baidu.com/item/%E8%9C%98%E8%9B%9B/8135707
已经抓起9个连接
https://baike.baidu.com/item/%E5%85%89/18484512#viewPageContent
已经抓起10个连接
https://baike.baidu.com/item/%E8%9C%98%E8%9B%9B/20183366#viewPageContent
已经抓起11个连接
https://baike.baidu.com/item/%E5%85%89/19494073#viewPageContent
已经抓起12个连接
https://baike.baidu.com/item/%E5%85%89/13832884#viewPageContent
已经抓起13个连接
https://baike.baidu.com/item/Butterfly/13002506
已经抓起14个连接
https://baike.baidu.com/item/Butterfly/21285078#viewPageContent
已经抓起15个连接
https://baike.bai