## 使用LinkExtractor提取链接

In [3]:
# 构造函数
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
             tags=('a', 'area'), attrs=('href',), canonicalize=False,
             unique=True, process_value=None, deny_extensions=None, restrict_css=(),
             strip=True):
    pass

- allow 接收一个正则表达式或一个正则表达式列表，提取url与正则表达式匹配的链接。
- deny 接收一个正则表达式或一个正则表达式列表，排除url与正则表达式匹配的链接。
- allow_domains 接收一个域名或一个域名列表，提取到指定域的链接。
- deny_domains 接收一个域名或一个域名列表，排除指定域的链接。
- restrict_xpaths 接收一个xpath表达式，提取xpath表达式选中区域下的链接。
- restrict_css 接收一个css表达式，提取css表达式选中区域下的链接。
- tags 接收一个标签或一个标签列表，提取指定标签内的链接。
- attrs 接收一个属性或一个属性列表，提取指定属性内的链接。
- process_value 接收一个形如func(value)的回调函数。如果传递了该参数，LinkExtrator将调用该回调函数对提取的每一个链接进行处理，回调函数正常情况下应返回一个字符串，想要抛弃所处理的链接时，返回None。

In [4]:
# 方法extract_links
def extract_links(self, response):
    base_url = get_base_url(response)
    if self.restrict_xpaths:
        docs = [subdoc
                for x in self.restrict_xpaths
                for subdoc in response.xpath(x)]
    else:
        docs = [response.selector]
    all_links = []
    for doc in docs:
        links = self._extract_links(doc, response.url, response.encoding, base_url)
        all_links.extend(self._process_links(links))
    return unique_list(all_links)

传入response对象后，自动生成绝对url的list。

In [15]:
'''
<!--example1.html-->
<html>
    <body>
        <div id="top">
            <p>下面是一些站内链接</p>
            <a class="internal" href="/intro/install.html">Installation guide</a>
            <a class="internal" href="/intro/tutorial.html">Tutorial</a>
            <a class="internal" href="../examples.html">Examples</a>            
        </div>
        <div id="bottom">
            <p>下面是一些站外链接</p>
            <a href="http://stackoverflow.com/tags/scrapy/info">StackOverflow</a>
            <a href="https://github.com/scrapy/scrapy">Fork on Github</a>         
        </div>
    </body>
</html>
<!--example2.html-->
<html>
    <head>
        <script type='text/javascript' src='/js/app1.js'/>
        <script type='text/javascript' src='/js/app2.js'/>
    </head>
    <body>
        <div>
            <a href="/home.html">主页</a>
            <a href="javascript:goToPage('/doc.html');return false">文档</a>
            <a href="javascript:goToPage('/example.html');return false">案例</a>
            </div>
    </body>
</html>
'''

'\n<!--example1.html-->\n<html>\n    <body>\n        <div id="top">\n            <p>下面是一些站内链接</p>\n            <a class="internal" href="/intro/install.html">Installation guide</a>\n            <a class="internal" href="/intro/tutorial.html">Tutorial</a>\n            <a class="internal" href="../examples.html">Examples</a>            \n        </div>\n        <div id="bottom">\n            <p>下面是一些站外链接</p>\n            <a href="http://stackoverflow.com/tags/scrapy/info">StackOverflow</a>\n            <a href="https://github.com/scrapy/scrapy">Fork on Github</a>         \n        </div>\n    </body>\n</html>\n<!--example2.html-->\n<html>\n    <head>\n        <script type=\'text/javascript\' src=\'/js/app1.js\'/>\n        <script type=\'text/javascript\' src=\'/js/app2.js\'/>\n    </head>\n    <body>\n        <div>\n            <a href="/home.html">主页</a>\n            <a href="javascript:goToPage(\'/doc.html\');return false">文档</a>\n            <a href="javascript:goToPage(\'/example.htm

In [19]:
from scrapy.http import HtmlResponse
html1=open('./example1.html',encoding='utf-8').read()
html2=open('./example2.html',encoding='utf-8').read()
response1=HtmlResponse(url='http://example1.com',body=html1,encoding='utf8')
response2=HtmlResponse(url='http://example2.com',body=html2,encoding='utf8')

In [21]:
from scrapy.linkextractors import LinkExtractor
le=LinkExtractor() # new LinkExtractor对象
links=le.extract_links(response1) # response对象中提取links
[link.url for link in links]

['http://example1.com/intro/install.html',
 'http://example1.com/intro/tutorial.html',
 'http://example1.com/examples.html',
 'http://stackoverflow.com/tags/scrapy/info',
 'https://github.com/scrapy/scrapy']

In [25]:
# LinkExtractor构造函数使用allow字段
le=LinkExtractor(allow='/intro/.+\.html$')
links=le.extract_links(response1)
[link.url for link in links]

['http://example1.com/intro/install.html',
 'http://example1.com/intro/tutorial.html']

In [27]:
# LinkExtractor构造函数使用deny字段
le=LinkExtractor(deny='\.html$')
links=le.extract_links(response1)
[link.url for link in links]

['http://stackoverflow.com/tags/scrapy/info',
 'https://github.com/scrapy/scrapy']

In [28]:
# LinkExtractor构造函数使用allow_domains字段
domains=['github.com','stackoverflow.com']
le=LinkExtractor(allow_domains=domains)
links=le.extract_links(response1)
[link.url for link in links]

['http://stackoverflow.com/tags/scrapy/info',
 'https://github.com/scrapy/scrapy']

In [29]:
# LinkExtractor构造函数使用deny_domains字段
domains=['github.com','stackoverflow.com']
le=LinkExtractor(deny_domains=domains)
links=le.extract_links(response1)
[link.url for link in links]

['http://example1.com/intro/install.html',
 'http://example1.com/intro/tutorial.html',
 'http://example1.com/examples.html']

In [30]:
# LinkExtractor构造函数使用restrict_xpath字段
le=LinkExtractor(restrict_xpaths='//div[@id="top"]')
links=le.extract_links(response1)
[link.url for link in links]

['http://example1.com/intro/install.html',
 'http://example1.com/intro/tutorial.html',
 'http://example1.com/examples.html']

In [32]:
# LinkExtractor构造函数使用restrict_css字段
le=LinkExtractor(restrict_css='div#bottom')
links=le.extract_links(response1)
[link.url for link in links]

['http://stackoverflow.com/tags/scrapy/info',
 'https://github.com/scrapy/scrapy']

In [33]:
# LinkExtractor构造函数使用tags,attrs字段
le=LinkExtractor(tags='script',attrs='src')
links=le.extract_links(response2)
[link.url for link in links]

['http://example2.com/js/app1.js', 'http://example2.com/js/app2.js']

In [34]:
# LinkExtractor构造函数使用process_value字段

import re
def process(value):
    m=re.search("javascript:goToPage\'(.*?)'",value)
    if m:
        value=m.group(1)
    return value

le=LinkExtractor(process_value=process)
links=le.extract_links(response2)
[link.url for link in links]

['http://example2.com/home.html']