### re.match

re.match(pattern,string,flags=0)

### 最常规的匹配

In [1]:
import re

content='Hello 123 4567 World_This is a Regex Demo'
print(len(content))
result=re.match('^Hello\s\d\d\d\s\d{4}\s\w{10}.*Demo$',content)
print(result)
print(result.group())
print(result.span())

41
<_sre.SRE_Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
(0, 41)


### 范匹配

In [2]:
import re

content='Hello 123 4567 World_This is a Regex Demo'
result=re.match('^Hello.*Demo$',content)
print(result)
print(result.group())
print(result.span())

<_sre.SRE_Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>
Hello 123 4567 World_This is a Regex Demo
(0, 41)


### 匹配目标

In [4]:
import re

content='Hello 1234567 World_This is a Regex Demo'
result=re.match('^Hello\s(\d+)\s.*Demo$', content)
print(result)
print(result.group())
print(result.group(1))
print(result.span())

<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_This is a Regex Demo'>
Hello 1234567 World_This is a Regex Demo
1234567
(0, 40)


### 贪婪匹配

In [5]:
import re

content='Hello 1234567 World_This is a Regex Demo'
result=re.match('^He.*(\d+).*Demo$', content)
print(result)
print(result.group(1))
print(result.span())

<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_This is a Regex Demo'>
7
(0, 40)


### 非贪婪匹配

In [6]:
import re

content='Hello 1234567 World_This is a Regex Demo'
result=re.match('^He.*?(\d+).*Demo$', content)
print(result)
print(result.group(1))
print(result.span())

<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_This is a Regex Demo'>
1234567
(0, 40)


### 匹配模式（换行符不能用.代替）

In [12]:
import re

content='''Hello 1234567 World_This 
is a Regex Demo
'''
result=re.match('^He.*?(\d+).*?Demo$', content, re.S)
print(result)
print(result.group(1))
print(result.span())

<_sre.SRE_Match object; span=(0, 41), match='Hello 1234567 World_This \nis a Regex Demo'>
1234567
(0, 41)


### 转义

In [13]:
import re

content = 'price is $5.00'
result= re.match('price is $5.00',content)
print(result)

None


In [14]:
import re

content = 'price is $5.00'
result= re.match('price is \$5\.00',content)
print(result)

<_sre.SRE_Match object; span=(0, 14), match='price is $5.00'>


总结：尽量使用范匹配、使用括号得到匹配目标、尽量使用非贪婪模式、有换行符就用re.S

### re.search

re.search扫描整个字符串并返回第一个成功的匹配

In [15]:
import re

content='Extra strings Hello 1234567 world_This is a Regex Demo Extra stings'
result=re.match('Hello.*?(\d+).*?Demo', content)
print(result)

None


In [16]:
import re

content='Extra strings Hello 1234567 world_This is a Regex Demo Extra stings'
result=re.search('Hello.*?(\d+).*?Demo', content)
print(result)

<_sre.SRE_Match object; span=(14, 54), match='Hello 1234567 world_This is a Regex Demo'>


总结：为匹配方便，能用search不用match

### 匹配演练

In [23]:
import re

html='''<div class="recommend-feed feed-item">
<span class="zg-right zg-gray-normal feed-meta">indienova 独立游戏</span>
<h2><a class="post-link" target="_blank" href="https://zhuanlan.zhihu.com/p/32336267" data-za-element-name="Title">indienova选出的2017十大独立游戏</a></h2>
</div>

<div class="recommend-feed feed-item">
<span class="zg-right zg-gray-normal feed-meta">问答</span>
<h2><a class="question_link" href="/question/264982351/answer/288916910" target="_blank" data-id="20424165" data-za-element-name="Title">
氢键是否可以看作分子间作用力（范德华力）向化学键的过渡？
</a></h2>
</div>
'''
result=re.search('<h2.*?element.*?>(.*?)</a>',html,re.S)
if result:
    print(result.group(1))

indienova选出的2017十大独立游戏


### re.findall

搜索字符串，以列表形式返回全部能匹配的子串

In [22]:
import re

html='''<div class="recommend-feed feed-item">
<span class="zg-right zg-gray-normal feed-meta">indienova 独立游戏</span>
<h2><a class="post-link" target="_blank" href="https://zhuanlan.zhihu.com/p/32336267" data-za-element-name="Title">indienova选出的2017十大独立游戏</a></h2>
</div>

<div class="recommend-feed feed-item">
<span class="zg-right zg-gray-normal feed-meta">问答</span>
<h2><a class="question_link" href="/question/264982351/answer/288916910" target="_blank" data-id="20424165" data-za-element-name="Title">
氢键是否可以看作分子间作用力（范德华力）向化学键的过渡？
</a></h2>
</div>
'''
result=re.findall('<h2.*?class="(.*?)".*?href="(.*?)".*?</a>',html,re.S)
print(result)
print(type(result))
for r in result:
    print(r)
    print(r[0],r[1])

[('post-link', 'https://zhuanlan.zhihu.com/p/32336267'), ('question_link', '/question/264982351/answer/288916910')]
<class 'list'>
('post-link', 'https://zhuanlan.zhihu.com/p/32336267')
post-link https://zhuanlan.zhihu.com/p/32336267
('question_link', '/question/264982351/answer/288916910')
question_link /question/264982351/answer/288916910


In [30]:
import re

html='''<div class="recommend-feed feed-item">
<span class="zg-right zg-gray-normal feed-meta">indienova 独立游戏</span>
<h2><a class="post-link" target="_blank" href="https://zhuanlan.zhihu.com/p/32336267" data-za-element-name="Title">indienova选出的2017十大独立游戏</a></h2>
</div>

<div class="recommend-feed feed-item">
<span class="zg-right zg-gray-normal feed-meta">问答</span>
<h2><a class="question_link" href="/question/264982351/answer/288916910" target="_blank" data-id="20424165" data-za-element-name="Title">
氢键是否可以看作分子间作用力（范德华力）向化学键的过渡？
</a></h2>
</div>
'''
result=re.findall('<h2>.*?class="(.*?)".*?href="(.*?)".*?"Title">(.*?)\s*?</a></h2>',html,re.S)
print(result)
for r in result:
    print(r[2])

[('post-link', 'https://zhuanlan.zhihu.com/p/32336267', 'indienova选出的2017十大独立游戏'), ('question_link', '/question/264982351/answer/288916910', '\n氢键是否可以看作分子间作用力（范德华力）向化学键的过渡？')]
indienova选出的2017十大独立游戏

氢键是否可以看作分子间作用力（范德华力）向化学键的过渡？


### re.sub

替换字符串中每个匹配的子串后返回替换后的字符串

In [1]:
import re

content='Extra hello world 1234567 extra demo'
content=re.sub('\d+','',content)
print(content)

Extra hello world  extra demo


In [2]:
import re

content='Extra hello world 1234567 extra demo'
content=re.sub('\d+','replacement',content)
print(content)

Extra hello world replacement extra demo


In [3]:
import re

content='Extra hello world 1234567 Extra demo'
content=re.sub('(\d+)',r'\1 8910',content)
print(content)

Extra hello world 1234567 8910 Extra demo


\1表示（\d+）

In [9]:
import re

html='''<div class="recommend-feed feed-item">
<span class="zg-right zg-gray-normal feed-meta">indienova 独立游戏</span>
<h2><a class="post-link" target="_blank" href="https://zhuanlan.zhihu.com/p/32336267" data-za-element-name="Title">indienova选出的2017十大独立游戏</a></h2>
</div>

<div class="recommend-feed feed-item">
<span class="zg-right zg-gray-normal feed-meta">问答</span>
<h2><a class="question_link" href="/question/264982351/answer/288916910" target="_blank" data-id="20424165" data-za-element-name="Title">
氢键是否可以看作分子间作用力（范德华力）向化学键的过渡？
</a></h2>
</div>
'''

html=re.sub('<h.*？>|</h2>','',html)
print(html)
result=re.findall('<a.*?class=".*?".*?href=".*?".*?"Title">(.*?)\s*?</a>',html,re.S)
print(result)
for r in result:
    print(r.strip())

<div class="recommend-feed feed-item">
<span class="zg-right zg-gray-normal feed-meta">indienova 独立游戏</span>
<h2><a class="post-link" target="_blank" href="https://zhuanlan.zhihu.com/p/32336267" data-za-element-name="Title">indienova选出的2017十大独立游戏</a>
</div>

<div class="recommend-feed feed-item">
<span class="zg-right zg-gray-normal feed-meta">问答</span>
<h2><a class="question_link" href="/question/264982351/answer/288916910" target="_blank" data-id="20424165" data-za-element-name="Title">
氢键是否可以看作分子间作用力（范德华力）向化学键的过渡？
</a>
</div>

['indienova选出的2017十大独立游戏', '\n氢键是否可以看作分子间作用力（范德华力）向化学键的过渡？']
indienova选出的2017十大独立游戏
氢键是否可以看作分子间作用力（范德华力）向化学键的过渡？


### re.compile

将一个正则表达式串编译成正则对象，以便于复用该匹配对象

In [3]:
import re

content= '''Hello 1234567 World_This 
is a Regex Demo
'''
pattern = re.compile('Hello.*?Demo',re.S)
result1 = re.match(pattern, content)
print(result1)

<_sre.SRE_Match object; span=(0, 41), match='Hello 1234567 World_This \nis a Regex Demo'>


### 实战练习

In [None]:
import requests
import re

content= requests.get('https://book.douban.com/').text
pattern = re.compile('<li.*?cover.*?href="(.*?)".*?title="(.*?)".*?more-meta.*?author">(.*?)</div>.*?year">(.*?)</span>.*?</li>',re.S)
results=re.findall(pattern,content)
print(results)
for result in results:
    url,name,author,date = result
    print(url,name,author,date)

In [2]:
import requests
import re

content= requests.get('https://book.douban.com/').text
results=re.search('<li.*?cover.*?title="(.*?)".*?</li>',content,re.S)

In [1]:
html='''<li class="">
            <div class="cover">
              <a href="https://book.douban.com/subject/27589516/?icn=index-editionrecommend" title="长长的路 我们慢慢走">
                <img src="https://img3.doubanio.com/lpic/s29618085.jpg" class="" width="115px" height="172px" alt="长长的路 我们慢慢走">
              </a>
            </div>
                <div class="intervenor-info">
                    <img src="https://img3.doubanio.com/f/book/ef040178fab1770d60e3f2f12ba4c7aa70714396/pics/book/partner/jd_recommend.png" class="jd-icon" width="16" height="16"> 
                    <span>推荐</span>
                </div>
            <div class="info">
              <div class="title">
                <a class="" href="https://book.douban.com/subject/27589516/?icn=index-editionrecommend" title="长长的路 我们慢慢走">长长的路 我们慢慢走</a>
              </div>
              <div class="author">
                余光中
              </div>
              <div class="more-meta">
                <h4 class="title">
                  长长的路 我们慢慢走
                </h4>
                <p>
                  <span class="author">
                    余光中
                  </span>
                  /
                  <span class="year">
                    2017-12-1
                  </span>
                  /
                  <span class="publisher">
                    光明日报出版社
                  </span>
                </p>
                <p class="abstract">
                  
                  本书精选余光中先生散文36篇，包括游记见闻、感情经历、生活智慧、人情世故、文化随感这五部分内容。
在这本书里余光中先生以人生过来人的姿态，为每一个读者提供生命的启示和前行的力量。
物质支撑人的生活，而信念支撑人的灵魂。人生这趟旅途，“去向远方”是每个人生命中最浪漫的冲动，也是每个人对抗孤独与现实的力量之源。
我们的生命，短暂却又漫长，短暂的是外...
                </p>
              </div>
            </div>
          </li>
'''
import requests
import re

results=re.findall('<li.*?cover.*?href="(.*?)".*?title="(.*?)".*?more-meta.*?author">(.*?)</span>.*?year">(.*?)</span>.*?</li>',html,re.S)
print(results)


[('https://book.douban.com/subject/27589516/?icn=index-editionrecommend', '长长的路 我们慢慢走', '\n                    余光中\n                  ', '\n                    2017-12-1\n                  ')]
