# 2.1 针对任意多的分隔符拆分字符串

In [1]:
# Sample text whose fields are separated by spaces, semicolons and commas,
# some of them followed by extra whitespace.
line = 'asdf fjdk; afed, fjek,asdf,    foo'
import re
# Split on any single ';', ',' or whitespace character together with any
# whitespace that follows it; the delimiters themselves are discarded.
re.split(r'[;,\s]\s*', line)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

In [3]:
# Putting the delimiter alternatives in a capture group makes re.split()
# include each matched delimiter in the result. The trailing \s* is outside
# the group, so the extra whitespace after a delimiter is still dropped.
fields = re.split(r'(;|,|\s)\s*', line)
fields

['asdf', ' ', 'fjdk', ';', 'afed', ',', 'fjek', ',', 'asdf', ',', 'foo']

In [4]:
# Even positions of `fields` hold the text values; odd positions hold the
# captured delimiters.
values = fields[::2]
# Pad with '' so that every value has a matching delimiter entry.
delimiters = fields[1::2] + ['']
print(values)
print(delimiters)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']
[' ', ';', ',', ',', ',', '']


In [5]:
# Reform the line using the same delimiters.
# Whitespace that followed a delimiter was not captured by the split, so it
# is not restored here.
''.join(v+d for v,d in zip(values, delimiters))

'asdf fjdk;afed,fjek,asdf,foo'

In [6]:
# (?:...) is a non-capturing group: the alternatives are still grouped, but
# the matched delimiters are left out of the result list.
re.split(r'(?:,|;|\s)\s*', line)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

# 2.2 在字符串的开头或结尾处做文本匹配

In [9]:
# str.endswith() and str.startswith() test for a literal suffix / prefix
# and return a bool.
filename = 'spam.txt'
print(filename.endswith('.txt'))
print(filename.startswith('file:'))

url = 'http://www.python.org'
print(url.startswith('http:'))

True
False
True


In [10]:
import os 
# Names of the entries in the current working directory.
filenames = os.listdir('.')
print(filenames)

['.ipynb_checkpoints', '第一章 数据结构和算法.ipynb', '第二章 字符串和文本.ipynb']


In [11]:
# Select C source and header files from the directory listing; passing a
# tuple lets endswith() test several suffixes in a single call.
[name for name in filenames if name.endswith(('.c', '.h'))]

[]

In [12]:
# True if at least one name in `filenames` ends with '.py'.
any(name.endswith('.py') for name in filenames)

False

In [15]:
from urllib.request import urlopen

def read_data(name):
    """Return the contents of *name*, which may be a URL or a local path.

    Names starting with ``http:``, ``https:`` or ``ftp:`` are fetched with
    ``urlopen`` and the raw response body is returned. Any other name is
    opened as a local file in text mode and its text is returned.
    """
    if name.startswith(('http:', 'https:', 'ftp:')):
        # Use the response as a context manager so it is closed as soon as
        # the body has been read, rather than whenever it is garbage
        # collected.
        with urlopen(name) as response:
            return response.read()
    with open(name) as f:
        return f.read()

In [18]:
choices = ['http:', 'ftp:']
url = 'http://www.python.org'
# startswith() accepts a str or a tuple of str, but not a list, so
# url.startswith(choices) would raise TypeError; convert to a tuple first.
print(url.startswith(tuple(choices)))

True


In [19]:
filename = 'spam.txt'
# Comparing the last four characters gives the same answer as
# filename.endswith('.txt'), but hard-codes the suffix length.
filename[-4:] == '.txt'

True

In [20]:
url = 'http://www.python.org'
# re.match() only matches at the beginning of the string, so this
# alternation acts as a prefix test; it returns a match object, or None
# when no alternative matches.
re.match('http:|https:|ftp:', url)


<re.Match object; span=(0, 5), match='http:'>

# 2.3 利用Shell通配符做字符串匹配

In [22]:
from fnmatch import fnmatch, fnmatchcase
# Shell-style wildcards: '*' matches any run of characters, '?' matches
# exactly one character, and '[0-9]' matches one character from the set.
print(fnmatch('foo.txt', '*.txt'))
print(fnmatch('foo.txt', '?oo.txt'))
print(fnmatch('Dat45.csv', 'Dat[0-9]*'))
names = ['Dat1.csv', 'Dat2.csv', 'config.ini', 'foo.py']
# Keep only the names that match the wildcard pattern.
print([name for name in names if fnmatch(name, 'Dat*.csv')])

True
True
True
['Dat1.csv', 'Dat2.csv']


In [23]:
# fnmatch() case-normalizes both arguments using the operating system's
# rules, so this result is platform-dependent: True on case-insensitive
# systems such as Windows, False on POSIX systems.
fnmatch('foo.txt', '*.TXT')

True

In [24]:
# Sample street addresses (plain strings) to filter with wildcard patterns.
addresses = [
    '5412 N CLARK ST',
    '1060 W ADDISON ST',
    '1039 W GRANVILLE AVE',
    '2122 N CLARK ST',
    '4802 N BROADWAY',
]

In [25]:
from fnmatch import fnmatchcase
# fnmatchcase() compares case-sensitively, exactly as the pattern is
# written, regardless of platform.
# Addresses ending in 'ST':
print([addr for addr in addresses if fnmatchcase(addr, '*ST')])
# Addresses starting with '54', two more digits and a space, and containing 'CLARK':
print([addr for addr in addresses if fnmatchcase(addr, '54[0-9][0-9] *CLARK*')])

['5412 N CLARK ST', '1060 W ADDISON ST', '2122 N CLARK ST']
['5412 N CLARK ST']


# 2.4 文本模式的匹配和查找

In [27]:
text = 'yeah, but no, but yeah, but no, but yeah'

# Exact match
print(text == 'yeah')

# Match at start or end
print(text.startswith('yeah'))
print(text.endswith('no'))

# Search for the location of the first occurrence
# (find() returns the lowest index, or -1 when the substring is absent)
print(text.find('no'))

False
True
False
10


In [28]:
# The same date written two ways: numeric with slashes, and with an
# abbreviated month name.
text1 = '11/27/2012'
text2 = 'Nov 27, 2012'

In [29]:
import re 

# Simple matching: \d+ means match one or more digits.
# re.match() returns a match object (truthy) or None, so it can drive a
# conditional expression directly.
print('yes' if re.match(r'\d+/\d+/\d+', text1) else 'no')
print('yes' if re.match(r'\d+/\d+/\d+', text2) else 'no')

yes
no


In [32]:
# Compile the pattern once so the same pattern object can be reused for
# several matches.
datepat = re.compile(r'\d+/\d+/\d+')
print('yes' if datepat.match(text1) else 'no')
print('yes' if datepat.match(text2) else 'no')

yes
no


In [33]:
text = 'Today is 11/27/2012, PyCon starts 3/13/2013'
# Unlike match(), findall() scans the whole string and returns a list of
# every non-overlapping match.
datepat.findall(text)

['11/27/2012', '3/13/2013']