# Regular Expressions

In [1]:
import re

In [2]:
# Compile the literal pattern 'abc' once; the following cells reuse `pat`.
pat = re.compile(r'abc')
# Show the compiled pattern's repr and its type, one per line.
print(pat, type(pat), sep='\n')

re.compile('abc')
<class 're.Pattern'>


In [3]:
# match() only looks at the start of the string: the first subject begins
# with 'ABC' (no match -> None), the second begins with 'abc' (Match object).
mat_abc1, mat_abc2 = (
    pat.match(subject) for subject in ('ABC,ABc,AbC,abc', 'abc,ABc,AbC,abc')
)
print(mat_abc1, mat_abc2, sep='\n')

None
<re.Match object; span=(0, 3), match='abc'>


In [4]:
# search() scans the whole string and returns the first match it finds,
# wherever that match starts.
sear_abc1, sear_abc2 = map(pat.search, ('ABC,ABc,AbC,abc', 'abc,ABc,AbC,abc'))
print(sear_abc1, sear_abc2, type(sear_abc1), sep='\n')

<re.Match object; span=(12, 15), match='abc'>
<re.Match object; span=(0, 3), match='abc'>
<class 're.Match'>


In [5]:
# findall() collects every non-overlapping match into a list of strings.
find_abc1, find_abc2 = (
    pat.findall(subject) for subject in ('ABC,ABc,AbC,abc', 'abc,ABc,AbC,abc')
)
print(find_abc1, find_abc2, sep='\n')

['abc']
['abc', 'abc']


In [6]:
# finditer() returns an iterator rather than a list, so printing it shows
# only the iterator object itself.
finditer_abc = pat.finditer('abc,ABc,abc,abc')

print(finditer_abc)

# Iterating yields one Match object for each occurrence of 'abc'.
for m in finditer_abc:
    print(m)
    


<callable_iterator object at 0x000002570A2551C0>
<re.Match object; span=(0, 3), match='abc'>
<re.Match object; span=(8, 11), match='abc'>
<re.Match object; span=(12, 15), match='abc'>


## Metacharacters

### Metacharacters that match one character

#### Dot (.)

In [7]:
# Dot example
# ex1: '.' matches any single character except the newline, so '\n' is the
# only character of the subject missing from the result.
p = re.compile(r'.')
m = p.findall('a1\nA#')
print(m)

['a', '1', 'A', '#']


In [8]:
# ex2: '.' stands for any one character, so '.at' finds 'cat', 'bat' and
# 'sat' but not 'cap'.
p = re.compile(r'.at')
m = p.findall('cat bat sat cap')
print(m)

['cat', 'bat', 'sat']


#### Character Class ([])

In [9]:
# Character Class example
# ex1: [aA] matches exactly one character that is either 'a' or 'A'.
p = re.compile(r'[aA]')
m = p.findall('a1\nA#')
print(m)

['a', 'A']


In [10]:
# ex2: every character listed inside [] is an allowed alternative, and each
# match is still a single character.
p = re.compile(r'[abcABC]')
m = p.findall('abcABC')
print(m)

['a', 'b', 'c', 'A', 'B', 'C']


In [11]:
# ex3: [a-z0-9] -- ranges for lowercase letters and digits; the uppercase
# 'A' and 'P' are not matched (that would need [a-zA-Z0-9]).
p = re.compile(r'[a-z0-9]')
m = p.findall('d0A3z6P')
print(m)

# ex4: several ranges can be combined in one class.
p = re.compile(r'[C-Eb-c2-4]')
m = p.findall('ABCDEF abcd 012345')
print(m)

# ex5: a hyphen placed first (or last) in the class is a literal '-'.
p = re.compile(r'[-a-e]') # or [a-e-]
m = p.findall('e-a-s-y, easy')
print(m)


['d', '0', '3', 'z', '6']
['C', 'D', 'E', 'b', 'c', '2', '3', '4']
['e', '-', 'a', '-', '-', 'e', 'a']


In [12]:
# ex6: [^...] -- a '^' right after '[' negates the class, so this matches
# every character that is NOT a digit or a lowercase letter.
p = re.compile(r'[^0-9a-z]')
m = p.findall('1 2 3 Go')
print(m)

# ex7: a '^' anywhere else inside the class is just a literal caret.
p = re.compile(r'[0-9^a-z]')
m = p.findall('1 2 3 ^Go')
print(m)

[' ', ' ', ' ', 'G']
['1', '2', '3', '^', 'o']


#### \d and \D



In [13]:
# \d and \D examples
# ex1: \d matches a single decimal digit.
p = re.compile(r'\d')
m = p.findall('a1\nA#')
# The label is a raw string: '\d' in a normal literal is an invalid escape
# sequence and triggers a warning on current Python versions.
print(r"\d matches: ", m)

# ex2: \D matches any single character that is not a digit (newline included).
p = re.compile(r'\D')
m = p.findall('a1\nA#')
print(r"\D matches: ", m)

\d matches:  ['1']
\D matches:  ['a', '\n', 'A', '#']


#### \w and \W

In [14]:
# \w and \W examples
# ex1: \w matches a word character; here the letters, the digit and '_'.
p = re.compile(r'\w')
m = p.findall('_#a!E$4-')
# The label is a raw string: '\w' in a normal literal is an invalid escape
# sequence and triggers a warning on current Python versions.
print(r"\w matches: ", m)

# ex2: \W matches every character that \w does not.
p = re.compile(r'\W')
m = p.findall('_#a!E$4-')
print(r"\W matches: ", m)

\w matches:  ['_', 'a', 'E', '4']
\W matches:  ['#', '!', '$', '-']


#### \s and \S

In [15]:
# \s and \S examples
# ex1: \s matches one whitespace character (here tabs, newlines and a space).
text = 'Name\tISSS610\tISSS666\nJoe Jones\tA\tA\n'
print(text)

p = re.compile(r'\s')
m = p.findall(text)
# The label is a raw string: '\s' in a normal literal is an invalid escape
# sequence and triggers a warning on current Python versions.
print(r"\s matches: ", m)

# ex2: \S matches one non-whitespace character.
p = re.compile(r'\S')
m = p.findall(text)

print(r"\S matches: ", m)

Name	ISSS610	ISSS666
Joe Jones	A	A

\s matches:  ['\t', '\t', '\n', ' ', '\t', '\t', '\n']
\S matches:  ['N', 'a', 'm', 'e', 'I', 'S', 'S', 'S', '6', '1', '0', 'I', 'S', 'S', 'S', '6', '6', '6', 'J', 'o', 'e', 'J', 'o', 'n', 'e', 's', 'A', 'A']


### Escaping Character

#### Backslash (\\) as Escaping Character

In [16]:
# Escaping Character
# ex1: a bare '.' is a wildcard and matches the first character, while the
# escaped '\.' matches only a literal dot.
p1 = re.compile(r'.')
p2 = re.compile(r'\.')
m1, m2 = (pattern.search('smu.edu.sg') for pattern in (p1, p2))
print(m1, m2)

<re.Match object; span=(0, 1), match='s'> <re.Match object; span=(3, 4), match='.'>


In [17]:
# ex2: \d matches a digit, \\ matches one literal backslash, and the final
# 'd' is a plain letter.
p = re.compile(r'\d\\d')
# The subject is a raw string so that its backslash is not read as the start
# of an (invalid) Python escape sequence.
m = p.findall(r'135\d')
print(m)

['5\\d']


#### The Backslash Plague

In [18]:
# The Backslash Plague
# demo1: the Python literal '\\' is a single backslash, and a lone backslash
# is an incomplete escape for the regex engine, so compilation fails.

try:
    p = re.compile('\\')
except re.error as err:
    # Report the failure instead of letting the exception abort the rest of
    # the notebook run; the printed line is the same error message.
    compile_error = err
    print(f'error: {err}')
else:
    # Raw string: '\C' and '\P' are invalid escapes in a normal literal.
    m = p.findall(r'D:\Courses\Python')
    print(m)

error: bad escape (end of pattern) at position 0

In [19]:
# The Backslash Plague
# demo1 (continued): the Python literal '\\' is one backslash character, so
# a single backslash is all that gets printed.

print('\\')

\


In [20]:
# The Backslash Plague
# demo2: a raw string hands both backslashes to the regex engine, where
# '\\' means "one literal backslash".

p = re.compile(r'\\')
# The path is raw as well: '\C' and '\P' are invalid escape sequences in a
# normal string literal and trigger a warning on current Python versions.
m = p.findall(r'D:\Courses\Python')
print(m)

['\\', '\\']


### Anchors

#### Caret

In [21]:
# Caret
# ex1: '^' anchors the pattern to the start of the string, so only the
# leading 'abc' is found.
p = re.compile(r'^ab.')
m = p.findall('abc abd abe abf')
print(m)

['abc']


In [22]:
# ex2: the same pattern compiled first without and then with re.M
# (MULTILINE). Without the flag '^' matches only at the start of the
# string; with it, '^' also matches at the start of each line.
for p in (re.compile(r'^a[ab]c'), re.compile(r'^a[ab]c', flags=re.MULTILINE)):
    m = p.findall('aac\nabc')
    print(m)

['aac']
['aac', 'abc']


#### Dollar (\$)

In [23]:
# Dollar
# ex1: '$' anchors the pattern to the end of the string, so only the
# trailing 'abf' is found.
p = re.compile(r'ab.$')
m = p.findall('abc abd abe abf')
print(m)

['abf']


In [24]:
# ex2: the same pattern compiled first without and then with re.M
# (MULTILINE). Without the flag only the 'bc' at the end of the string is
# found; with it, '$' also matches at the end of each line.
for p in (re.compile(r'[ab]c$'), re.compile(r'[ab]c$', flags=re.MULTILINE)):
    m = p.findall('ac\nbc')
    print(m)

['bc']
['ac', 'bc']


#### Word Boundary: \b and \B

In [25]:
# Word Boundary:
# ex1: \b on both sides means the two digits must form a whole token, so
# the one-digit and three-digit numbers are skipped.
p = re.compile(r'\b\d\d\b')
m = p.findall('1 2 3 11 12 13 111 112 113')
print(m)

['11', '12', '13']


In [26]:
# ex2: punctuation and spaces all create word boundaries, so every
# two-letter token is found regardless of the separator around it.
p = re.compile(r'\b\w\w\b')
m = p.findall('aa,ab;ac(AA)AB AC')
print(m)

['aa', 'ab', 'ac', 'AA', 'AB', 'AC']


### Quantifiers

#### Asterisk (\*), Question Mark (?) and Plus (+)

In [27]:
# Asterisk, Question Mark and Plus
# ex1: one subject, three quantifiers applied to [ab]:
#   *  zero or more    +  one or more    ?  zero or one
for p in map(re.compile, (r'a[ab]*c', r'a[ab]+c', r'a[ab]?c')):
    m = p.findall('a ab ac abc aac aabc aaac ababc')
    print(m)

['ac', 'abc', 'aac', 'aabc', 'aaac', 'ababc']
['abc', 'aac', 'aabc', 'aaac', 'ababc']
['ac', 'abc', 'aac', 'abc', 'aac', 'abc']


#### {m} and {m,n}

In [28]:
# {m} and {m,n}
# ex1: {3} requires exactly three digits, {2,3} accepts two or three.
for p in map(re.compile, (r'\d{3}', r'\d{2,3}')):
    m = p.findall('1 2 3 11 12 13 111 112 113')
    print(m)

['111', '112', '113']
['11', '12', '13', '111', '112', '113']


### Grouping Constructs

#### Groups (())

In [32]:
# Groups
# ex1: with more than one group, findall() returns one tuple of captured
# groups per match. 'Course: Grade' is skipped because 'Grade' is not \d+.

p = re.compile(r'(\w+): (\d+)')
m = p.findall('Course: Grade\nMath: 89\nPhysics: 92\n English: 78')
print(m)

[('Math', '89'), ('Physics', '92'), ('English', '78')]


#### Quantifiers with Groups

In [60]:
# Quantifiers with Groups
# ex1: a quantifier after a group repeats the whole group; findall() reports
# what the single capturing group captured for each match.

p = re.compile(r'(ha)+')
m = p.findall('ha hh aa hahahaha')
print(m)

# ex2: capture everything after the 'Chapter ' prefix on each line.
chapters = '\n'.join((
    'Chapter 12: Numpy',
    'Chapter 13: Pandas',
    'Chapter 14: Data Visualzation',
))
p = re.compile(r'^Chapter (\d+: .+)', flags=re.MULTILINE)
m = p.findall(chapters)
print(m)


['ha', 'ha']
['12: Numpy', '13: Pandas', '14: Data Visualzation']


#### Alternation (|)

In [61]:
# Alternation
# ex1: '|' inside a group matches any one of the listed alternatives; the
# first group captures the file name and the second the extension.
p = re.compile(r'(\w+)\.(bat|zip|exe)')
m = p.findall('game.exe auto.bat text.zip')
print(m)

[('game', 'exe'), ('auto', 'bat'), ('text', 'zip')]


#### Capturing Groups

In [71]:
# re.Match.groups()
# ex1: search() stops at the first match; groups() then returns everything
# that match captured as a tuple.

p = re.compile(r'(\w+\.\w+)\s(\w+\.\w+)')
m = p.search('game.exe auto.bat text.zip')
print(m, m.groups(), sep='\n')

<re.Match object; span=(0, 17), match='game.exe auto.bat'>
('game.exe', 'auto.bat')


In [80]:
# re.Match.group()
# ex1: group(0) is the whole match, group(n) is the n-th captured group, and
# several indices at once return a tuple.
# NOTE(review): the last group is written `(\w)+`, which keeps only the final
# repetition; `(\w+)` may have been intended -- the result is the same for
# this subject because the last token is the single character '4'.
pattern = r'(\w+)\W+(\w+)\W+(\w+)\W+(\w)+'
p = re.compile(pattern)
m = p.search('one,,,two:three++++++4')
print(m.group(0), m.group(1), m.group(2, 3, 4), sep='\n')

one,,,two:three++++++4
one
('two', 'three', '4')


#### Backreference

In [87]:
# Backreference
# ex1: \2 must repeat exactly what group 2 captured, so the pattern finds
# "<digits><one digit><the same digits again>".

p = re.compile(r'((\d+)\d\2)')
m = p.finditer('1234123, 11311, 123, 54345')
for whole, repeated in (hit.group(1, 2) for hit in m):
    print((whole, repeated))

('1234123', '123')
('11311', '11')
('434', '4')


### Flags

In [92]:
# Flag re.I
# ex1: re.I (IGNORECASE) lets 'abc' match any mix of upper and lower case.
p1 = re.compile(r'abc')
p2 = re.compile(r'abc', flags=re.IGNORECASE)
m1, m2 = (pattern.findall('abc ABC aBC Abc') for pattern in (p1, p2))

print(m1, m2)

['abc'] ['abc', 'ABC', 'aBC', 'Abc']


In [93]:
# Flag re.S
# ex2: re.S (DOTALL) lets '.' match the newline as well.
p1 = re.compile(r'.')
p2 = re.compile(r'.', flags=re.DOTALL)
m1, m2 = (pattern.findall('Aa1! \n') for pattern in (p1, p2))

print(m1, m2)

['A', 'a', '1', '!', ' '] ['A', 'a', '1', '!', ' ', '\n']


### Module-Level Methods

#### re.match, re.search, re.findall, re.finditer

In [96]:
# ex1: the module-level functions take the pattern as their first argument,
# so no explicit re.compile() call is needed.

match = re.match(r'abc', 'abc')
search = re.search(r'abc', 'a abc')
findall = re.findall(r'abc', 'abc abc ab bc a b c')
finditer = re.finditer(r'abc', 'abc abc ab bc a b c')
print(
    f'match: {match}',
    f'search: {search}',
    f'findall: {findall}',
    f'finditer: {finditer}',
    sep='\n',
)

match: <re.Match object; span=(0, 3), match='abc'>
search: <re.Match object; span=(2, 5), match='abc'>
findall: ['abc', 'abc']
finditer: <callable_iterator object at 0x000002570A363310>


#### String-modifying methods

In [104]:
# re.split()
# ex1: split on runs of non-word characters. When the separator pattern is
# wrapped in a capturing group, the separators are kept in the result.

for p in map(re.compile, (r'\W+', r'(\W+)')):
    split = p.split('The~split*method-is%powerful')
    print(split)

['The', 'split', 'method', 'is', 'powerful']
['The', '~', 'split', '*', 'method', '-', 'is', '%', 'powerful']


In [110]:
# re.sub() and re.subn()
# ex1: sub() returns the rewritten string; subn() returns a tuple of the
# rewritten string and the number of replacements made.

p = re.compile(r'Toko')
sub = p.sub('Tokyo', 'Toko is a large city.')
subn = p.subn('Tokyo', 'Toko is Toko')
print(sub, subn, sep='\n')

# ex2: the module-level form takes the pattern as its first argument.
sub = re.sub(pattern=r'Toko', repl='Tokyo', string='Toko is Toko')
print(sub)

Tokyo is a large city.
('Tokyo is Tokyo', 2)
Tokyo is Tokyo


## Exercises

### Exercise 3.11

Match all phone numbers in the given text.
```
Please dial our hotline number:
93811800
93811808
93811818
```

In [2]:
text = '''Please dial our hotline number:
93811800
93811808
93811818'''

p = re.compile(_______________, re.M) # remove the underline and fill in your code

print(p.findall(text))

['93811800', '93811808', '93811818']


### Exercise 3.12

Match all the prefixes and only the names following them in the text.
```
Here are the guests coming to the party tonight:
 - Mr. Joe Bean
 - Ms. Liz Johnson
 - Mrs. Alice Dawn Fredder
 - Miss. Emma Williams

```

In [8]:
text = '''Here are the guests coming to the party tonight:
 - Mr. Joe Bean
 - Ms. Liz Johnson
 - Mrs. Alice Dawn Fredder
 - Miss. Emma Williams'''


In [15]:
p = re.compile(_________________, re.M) # remove the underline and fill in your code

print(p.findall(text))

['Mr. Joe Bean', 'Ms. Liz Johnson', 'Mrs. Alice Dawn Fredder', 'Miss. Emma Williams']


### Exercise 3.13

Match all email addresses from the given text.
```
My old email address ott@oldcompany.com has been invalidated. For the moment, I use my personal email address ott_personal@email.sg. Later when I start my own company, I will use this email: ott@myowncompany.com.sg.
```

In [17]:
text = "My old email address ott@oldcompany.com has been invalidated. For the moment, I use my personal email address ott_personal@email.sg. Later when I start my own company, I will use this email: ott@myowncompany.com.sg."

In [18]:

p = re.compile(________________, re.M) # remove the underline and fill in your code

print(p.findall(text))

['ott@oldcompany.com', 'ott_personal@email.sg', 'ott@myowncompany.com.sg']


### Exercise 3.14

Match all the names and phone numbers from the given text and create a dictionary using the names as keys and phone numbers as values.

\*Note that the correct format for a phone number is 9 digits evenly divided into three groups by two hyphens, e.g., 111-111-111. Please correct all the phone numbers you collected before you put them in the dictionary.

In [20]:
phones = '''LeBroooon James: 123-111-212
Dd Wade: 391-399-128
Steve Curry: 188381887
Kk Durant: 212-212212
Jon Harden: 371*371&196
Kiwi Leonard: -192-182-736-
Power George: 8198-18-817'''


In [25]:
# write your code here 
# You can refer to the output




{'LeBroooon James': '123-111-212', 'Dd Wade': '391-399-128', 'Steve Curry': '188-381-887', 'Kk Durant': '212-212-212', 'Jon Harden': '371-371-196', 'Kiwi Leonard': '192-182-736', 'Power George': '819-818-817'}


### Exercise 3.15

Match all the words that contain at least one character that appears more than once from the given text.

In [1]:
p = re.compile(_______________________) # remove the underline and fill in your code

p.findall('The group contains some of the most dangerous criminals in the country.')


[('contains', 'n'), ('criminals', 'i')]

### Exercise 3.16

Match all the words that contain at least 3 vowel letters (‘aeiou’) from the given text.

In [10]:
p = re.compile(_________________) # remove the underline and fill in your code

p.findall('The group contains some of the most dangerous criminals in the country.')


[('contains', 'ins'), ('dangerous', 'us'), ('criminals', 'als')]