# Pattern Matching with Regular Expressions

## Finding Patterns of Text Without Regular Expressions

In [1]:
def is_phone_number(s):
    """Return True if ``s`` is exactly a phone number shaped DDD-DDD-DDDD.

    The check is done by hand, without regular expressions: the string
    must be 12 characters long, have a hyphen at indexes 3 and 7, and
    have a decimal digit in every other position.

    ``str.isdecimal()`` is used instead of ``str.isdigit()`` because
    ``isdigit()`` also accepts characters such as the superscript two,
    which the regex digit class used by the regex-based versions of this
    check does not match.
    """
    if len(s) != 12:
        return False
    if s[3] != '-' or s[7] != '-':
        return False
    # Everything except the two hyphens must be a decimal digit.
    digits = s[:3] + s[4:7] + s[8:]
    return all(ch.isdecimal() for ch in digits)

In [2]:
s = '415-555-4242'
is_phone_number(s)

True

In [3]:
message = 'Call me at 415-555-1011 tomorrow. 415-555-9999 is my office.'
# Slide a 12-character window across the message (windows near the end are
# shorter and are simply rejected) and report every window that
# is_phone_number() accepts.
windows = (message[i:i + 12] for i in range(len(message)))
for chunk in filter(is_phone_number, windows):
    print('Phone number found: ' + chunk)
print('Done')

Phone number found: 415-555-1011
Phone number found: 415-555-9999
Done


## Finding Patterns of Text with Regular Expressions

In [4]:
import re
from pprint import pprint

### Regex Objects

In [5]:
def is_phone_number_re(s):
    """Return every DDD-DDD-DDDD phone number found in ``s``.

    Despite the ``is_`` prefix, the result is a list of the matched
    substrings (empty when nothing matches), not a boolean.
    """
    phone_pattern = re.compile(r'\d{3}-\d{3}-\d{4}')
    return phone_pattern.findall(s)

In [6]:
res = is_phone_number_re(message)
pprint(res)

['415-555-1011', '415-555-9999']


### re.search()

In [7]:
# Same phone-number shape as before, spelled with one \d per digit.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
# search() returns a match object for the first match, or None if there is
# none; s is the sample number assigned in an earlier cell.
mo = phoneNumRegex.search(s)
# group(0) is the entire matched text.
print('Phone number found: ' + mo.group(0))

Phone number found: 415-555-4242


### Match Objects: group() & groups()

In [8]:
# Parentheses create numbered groups: group 1 is the first three digits,
# group 2 is the remaining DDD-DDDD part.
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search(s)
# group(0) is the whole match; group(1) and group(2) are the two
# parenthesised parts.
print('Phone number found: ' + mo.group(0))
print('Phone number first: ' + mo.group(1))
print('Phone number second: ' + mo.group(2))

Phone number found: 415-555-4242
Phone number first: 415
Phone number second: 555-4242


In [9]:
pprint(mo.groups())

('415', '555-4242')


### Optional Matching with the Question Mark

In [10]:
# (wo)? makes the group optional: 'wo' may appear zero times or once.
batRegex = re.compile(r'Bat(wo)?man')

In [11]:
mo1 = batRegex.search('The Adventures of Batman')
mo1.group()

'Batman'

In [12]:
mo2 = batRegex.search('The Adventures of Batwoman')
mo2.group()

'Batwoman'

### Matching Zero or More with the Star

In [13]:
# (wo)* lets the group repeat any number of times, including zero.
batRegex = re.compile(r'Bat(wo)*man')

In [14]:
mo1 = batRegex.search('The Adventures of Batman')
mo1.group()

'Batman'

In [15]:
mo2 = batRegex.search('The Adventures of Batwoman')
mo2.group()

'Batwoman'

In [16]:
mo3 = batRegex.search('The Adventures of Batwowowowoman')
mo3.group()

'Batwowowowoman'

### Matching One or More with the Plus

In [17]:
# (wo)+ requires the group at least once, so plain 'Batman' cannot match.
batRegex = re.compile(r'Bat(wo)+man')

In [18]:
mo1 = batRegex.search('The Adventures of Batwoman')
mo1.group()

'Batwoman'

In [19]:
mo2 = batRegex.search('The Adventures of Batwowowowoman')
mo2.group()

'Batwowowowoman'

In [20]:
# '+' needs at least one 'wo', so plain 'Batman' does not match and
# search() returns None. Identity (is) is the idiomatic test for None.
mo3 = batRegex.search('The Adventures of Batman')
mo3 is None

True

### ?*+ Summary

- \? zero or one
- \* zero or more
- \+ one or more

### Greedy and Nongreedy Matching

In [21]:
# {3,5} is greedy by default: it takes as many repetitions as it can, so
# all five 'Ha' are consumed here.
greedyHaRegex = re.compile(r'(Ha){3,5}')
mo1 = greedyHaRegex.search('HaHaHaHaHa')
mo1.group()

'HaHaHaHaHa'

In [22]:
mo1.groups()

('Ha',)

In [23]:
# A ? after the {3,5} quantifier makes it non-greedy: it stops at the
# minimum of three repetitions.
nongreedyHaRegex = re.compile(r'(Ha){3,5}?')
mo2 = nongreedyHaRegex.search('HaHaHaHaHa')
mo2.group()

'HaHaHa'

In [24]:
mo2.groups()

('Ha',)

### The findall() Method

In [25]:
# With no groups in the pattern, findall() returns a list of the full
# matched strings.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d') # has no groups
phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000')

['415-555-9999', '212-555-0000']

In [26]:
# With groups in the pattern, findall() returns one tuple per match,
# holding the text of each group rather than the full match.
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)') # has groups
phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000')

[('415', '555', '9999'), ('212', '555', '0000')]

### Character Classes

### Substituting Strings with the sub() Method

In [27]:
# 'Agent ' followed by a whole word.
# NOTE(review): namesRegex is not used by any cell visible in this notebook.
namesRegex = re.compile(r'Agent \w+')
# Same, but the first letter of the word is captured as group 1.
agentNamesRegex = re.compile(r'Agent (\w)\w*')

In [28]:
# (\w) captures the first letter of the name. \1 in the replacement puts
# that letter back, so every 'Agent <name>' becomes '<initial>****'.
agentNamesRegex = re.compile(r'Agent (\w)\w*')
agentNamesRegex.sub(r'\1****', 'Agent Alice told Agent Carol that Agent '
                    'Eve knew Agent Bob was a double agent.')

'A**** told C**** that E**** knew B**** was a double agent.'

## Question

In [29]:
# One to three digits, then zero or more groups of a comma followed by
# exactly three digits, anchored at both ends of the string.
p = re.compile(r'^\d{1,3}(,\d{3})*$')
s = '''42
1,234
123,234
6,368,745
12,34,567
1234
123456
'''

# Report, one candidate per line, whether it is a correctly comma-grouped
# number.
for each in s.split():
    label = 'match:' if p.match(each) else 'not match:'
    print(label, each)

match: 42
match: 1,234
match: 123,234
match: 6,368,745
not match: 12,34,567
not match: 1234
not match: 123456


In [30]:
# A one-word first name starting with a capital letter, one whitespace
# character, then the surname. The trailing $ anchors the end of the
# string so that extra text after 'Nakamoto' (e.g. 'Satoshi Nakamotos')
# is not accepted.
p = re.compile(r'^[A-Z][a-z]*\sNakamoto$')

s = '''Satoshi Nakamoto
Alice Nakamoto
Robocop Nakamoto
satoshi Nakamoto
Mr. Nakamoto
Nakamoto
Satoshi nakamoto'''

# Check each candidate name on its own line.
for each in s.split('\n'):
    if p.match(each):
        print('match:', each)
    else:
        print('not match:', each)

match: Satoshi Nakamoto
match: Alice Nakamoto
match: Robocop Nakamoto
not match: satoshi Nakamoto
not match: Mr. Nakamoto
not match: Nakamoto
not match: Satoshi nakamoto


In [31]:
# Subject, verb and object each come from a fixed list, separated by single
# whitespace characters and followed by a period; re.I makes the match
# case-insensitive. The trailing $ anchors the end of the string so that
# text after the period is not accepted.
p = re.compile(r'(Alice|Bob|Carol)\s(eats|throws|pets)\s(apples|cats|baseballs)\.$', re.I)

s = '''Alice eats apples.
Bob pets cats.
Carol throws baseballs.
Alice throws Apples.
BOB EATS CATS.
Robocop eats apples.
ALICE THROWS FOOTBALLS.
Carol eats 7 cats.'''

# Check each candidate sentence on its own line.
for each in s.split('\n'):
    if p.match(each):
        print('match:', each)
    else:
        print('not match:', each)

match: Alice eats apples.
match: Bob pets cats.
match: Carol throws baseballs.
match: Alice throws Apples.
match: BOB EATS CATS.
not match: Robocop eats apples.
not match: ALICE THROWS FOOTBALLS.
not match: Carol eats 7 cats.


## Practise

In [32]:
def strong_password_check(s):
    """Print whether ``s`` passes the is_strong() password check."""
    verdict = "Strong password!" if is_strong(s) else 'Not strong enough!'
    print(verdict)


def is_strong(s):
    """Return True if ``s`` qualifies as a strong password.

    A password is strong when it is at least 8 characters long and
    contains at least one digit, one uppercase letter in A-Z and one
    lowercase letter in a-z.
    """
    if len(s) < 8:
        return False
    # search() returns None when a character class is absent, so every
    # required class must produce a match. One character is enough, hence
    # no '+' quantifier.
    required_classes = (r'\d', r'[A-Z]', r'[a-z]')
    return all(re.search(cls, s) is not None for cls in required_classes)

In [33]:
pass1 = 'ASWDjpwijea3'
strong_password_check(pass1)

Strong password!


## Appendix

In [34]:
def func():
    """Print the text 'do nothing'; takes no arguments and returns None."""
    print('do nothing')

In [35]:
dir(func)

['__annotations__',
 '__call__',
 '__class__',
 '__closure__',
 '__code__',
 '__defaults__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__get__',
 '__getattribute__',
 '__globals__',
 '__gt__',
 '__hash__',
 '__init__',
 '__kwdefaults__',
 '__le__',
 '__lt__',
 '__module__',
 '__name__',
 '__ne__',
 '__new__',
 '__qualname__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__']

In [36]:
class Test:
    """Simple test class"""

    # The methods are intentionally empty: the class exists so that its
    # attribute names and docstrings can be inspected.
    def __init__(self):
        pass

    def func1(self):
        """func1"""

    def func2(self):
        """func2"""

    def func3(self):
        """func3"""

In [37]:
Test.__dict__.items()

dict_items([('func1', <function Test.func1 at 0x7f29b8341840>), ('func3', <function Test.func3 at 0x7f29b8341950>), ('__weakref__', <attribute '__weakref__' of 'Test' objects>), ('__dict__', <attribute '__dict__' of 'Test' objects>), ('func2', <function Test.func2 at 0x7f29b83418c8>), ('__init__', <function Test.__init__ at 0x7f29b83417b8>), ('__doc__', 'Simple test class'), ('__module__', '__main__')])

In [38]:
# Collect the attributes defined directly on Test whose names do not start
# with an underscore.
li = [member for name, member in Test.__dict__.items()
      if not name.startswith('_')]
pprint(li)

[<function Test.func1 at 0x7f29b8341840>,
 <function Test.func3 at 0x7f29b8341950>,
 <function Test.func2 at 0x7f29b83418c8>]


In [39]:
# Print each collected function's name and docstring, ordered by name.
by_name = sorted(li, key=lambda fn: fn.__name__)
for each in by_name:
    print(each.__name__, each.__doc__)

func1 func1
func2 func2
func3 func3
