# Regular expressions

Regular expressions are strings that define a search pattern.

In [3]:
import re

In [28]:
document = '''
This document contains the contact information on the trainers at Kubrick:
 
Mr. Albert
07397-602-324
 
Mr. David
07323-234-111
 
Mr. Lorenzo
07392-244-112
 
Ms. Sarah
07221-233-222
 
Mr. Simon Duncan
07221 222 456

Mr. Marko Cubric
+44 7738 433 978

Here are some random numbers:
1234567890123456789012345678901234567890Mr2
33333*333*333
 
This is the alphabet:
abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
 
a1 b2 c3 d4
'''

In [2]:
# Plain substring test: it only tells us whether the text occurs.
# What if we want to know more (where it occurs, how many times)?
'random numbers' in document

True

In [4]:
# Compile the literal text into a pattern object.
pattern = re.compile(r'random numbers')
list(pattern.finditer(document))  # finditer yields one Match per occurrence; list() lets us output them all

[<re.Match object; span=(232, 246), match='random numbers'>]

In [5]:
# Slicing the document with the span reported by the match gives back the matched text.
document[232:246]

'random numbers'

In [6]:
# A literal pattern can occur several times in the document.
pattern = re.compile(r'123')
list(pattern.finditer(document))  # now we get 4 matches

[<re.Match object; span=(248, 251), match='123'>,
 <re.Match object; span=(258, 261), match='123'>,
 <re.Match object; span=(268, 271), match='123'>,
 <re.Match object; span=(278, 281), match='123'>]

## Notes on regular expressions
---
~~~
General matching:
- \d : digit
- \w : word character (digits are also word characters)
- \s : whitespace (spaces, tabs, newlines and other whitespace characters)

- \D : not digit
- \W : not word char
- \S : not space

Quantifiers:
* : 0 or more
+ : 1 or more
? : 0 or 1
{5} : exactly 5

Wildcard:
. : matches any single character (except a newline, by default)

Character sets:
[ab]  : matches a or b
[a-z] : matches any one character from a to z
[^a]  : matches any one character except a
~~~

In [7]:
# \d matches a single digit, so every digit in the document is returned as its own match.
pattern = re.compile(r'\d')
list(pattern.finditer(document))

[<re.Match object; span=(89, 90), match='0'>,
 <re.Match object; span=(90, 91), match='7'>,
 <re.Match object; span=(91, 92), match='3'>,
 <re.Match object; span=(92, 93), match='9'>,
 <re.Match object; span=(93, 94), match='7'>,
 <re.Match object; span=(95, 96), match='6'>,
 <re.Match object; span=(96, 97), match='0'>,
 <re.Match object; span=(97, 98), match='2'>,
 <re.Match object; span=(99, 100), match='3'>,
 <re.Match object; span=(100, 101), match='2'>,
 <re.Match object; span=(101, 102), match='4'>,
 <re.Match object; span=(115, 116), match='0'>,
 <re.Match object; span=(116, 117), match='7'>,
 <re.Match object; span=(117, 118), match='3'>,
 <re.Match object; span=(118, 119), match='2'>,
 <re.Match object; span=(119, 120), match='3'>,
 <re.Match object; span=(121, 122), match='2'>,
 <re.Match object; span=(122, 123), match='3'>,
 <re.Match object; span=(123, 124), match='4'>,
 <re.Match object; span=(125, 126), match='1'>,
 <re.Match object; span=(126, 127), match='1'>,
 <re.Matc

In [8]:
# \w matches a single word character (letter, digit or underscore), again one match per character.
pattern = re.compile(r'\w')
list(pattern.finditer(document))

[<re.Match object; span=(1, 2), match='T'>,
 <re.Match object; span=(2, 3), match='h'>,
 <re.Match object; span=(3, 4), match='i'>,
 <re.Match object; span=(4, 5), match='s'>,
 <re.Match object; span=(6, 7), match='d'>,
 <re.Match object; span=(7, 8), match='o'>,
 <re.Match object; span=(8, 9), match='c'>,
 <re.Match object; span=(9, 10), match='u'>,
 <re.Match object; span=(10, 11), match='m'>,
 <re.Match object; span=(11, 12), match='e'>,
 <re.Match object; span=(12, 13), match='n'>,
 <re.Match object; span=(13, 14), match='t'>,
 <re.Match object; span=(15, 16), match='c'>,
 <re.Match object; span=(16, 17), match='o'>,
 <re.Match object; span=(17, 18), match='n'>,
 <re.Match object; span=(18, 19), match='t'>,
 <re.Match object; span=(19, 20), match='a'>,
 <re.Match object; span=(20, 21), match='i'>,
 <re.Match object; span=(21, 22), match='n'>,
 <re.Match object; span=(22, 23), match='s'>,
 <re.Match object; span=(24, 25), match='t'>,
 <re.Match object; span=(25, 26), match='h'>,
 <r

In [9]:
pattern = re.compile(r'\d\d')
list(pattern.finditer(document))  # two digits in a row; matches never overlap, so a leftover single digit is skipped

[<re.Match object; span=(89, 91), match='07'>,
 <re.Match object; span=(91, 93), match='39'>,
 <re.Match object; span=(95, 97), match='60'>,
 <re.Match object; span=(99, 101), match='32'>,
 <re.Match object; span=(115, 117), match='07'>,
 <re.Match object; span=(117, 119), match='32'>,
 <re.Match object; span=(121, 123), match='23'>,
 <re.Match object; span=(125, 127), match='11'>,
 <re.Match object; span=(143, 145), match='07'>,
 <re.Match object; span=(145, 147), match='39'>,
 <re.Match object; span=(149, 151), match='24'>,
 <re.Match object; span=(153, 155), match='11'>,
 <re.Match object; span=(169, 171), match='07'>,
 <re.Match object; span=(171, 173), match='22'>,
 <re.Match object; span=(175, 177), match='23'>,
 <re.Match object; span=(179, 181), match='22'>,
 <re.Match object; span=(202, 204), match='07'>,
 <re.Match object; span=(204, 206), match='22'>,
 <re.Match object; span=(208, 210), match='22'>,
 <re.Match object; span=(212, 214), match='45'>,
 <re.Match object; span=(24

In [10]:
# Spelling out every digit finds the hyphenated phone numbers, but the pattern looks a bit ugly.
pattern = re.compile(r'\d\d\d\d\d-\d\d\d-\d\d\d')
list(pattern.finditer(document))

[<re.Match object; span=(89, 102), match='07397-602-324'>,
 <re.Match object; span=(115, 128), match='07323-234-111'>,
 <re.Match object; span=(143, 156), match='07392-244-112'>,
 <re.Match object; span=(169, 182), match='07221-233-222'>]

In [11]:
# Same phone-number pattern, written with {n} quantifiers instead of repeating \d.
pattern = re.compile(r'\d{5}-\d{3}-\d{3}')
list(pattern.finditer(document))  # same matches as before, but the pattern is much easier to read

[<re.Match object; span=(89, 102), match='07397-602-324'>,
 <re.Match object; span=(115, 128), match='07323-234-111'>,
 <re.Match object; span=(143, 156), match='07392-244-112'>,
 <re.Match object; span=(169, 182), match='07221-233-222'>]

In [13]:
# "." is a wildcard, so any single character is accepted as the separator.
pattern = re.compile(r'\d{5}.\d{3}.\d{3}')
list(pattern.finditer(document))
# The wildcard is too permissive: it lets through more than just "-" as a separator,
# so we get a few more results than we wanted.

[<re.Match object; span=(89, 102), match='07397-602-324'>,
 <re.Match object; span=(115, 128), match='07323-234-111'>,
 <re.Match object; span=(143, 156), match='07392-244-112'>,
 <re.Match object; span=(169, 182), match='07221-233-222'>,
 <re.Match object; span=(202, 215), match='07221 222 456'>]

In [14]:
# [\s-] restricts the separator to a whitespace character (\s) or a hyphen.
pattern = re.compile(r'\d{5}[\s-]\d{3}[\s-]\d{3}')
list(pattern.finditer(document))  # now we have the phone numbers we want

[<re.Match object; span=(89, 102), match='07397-602-324'>,
 <re.Match object; span=(115, 128), match='07323-234-111'>,
 <re.Match object; span=(143, 156), match='07392-244-112'>,
 <re.Match object; span=(169, 182), match='07221-233-222'>,
 <re.Match object; span=(202, 215), match='07221 222 456'>]

In [15]:
# Uses the `pattern` compiled in an earlier cell; .group() returns the text of each Match.
match_list = list(pattern.finditer(document))
[m.group() for m in match_list]  # list of the phone numbers

['07397-602-324',
 '07323-234-111',
 '07392-244-112',
 '07221-233-222',
 '07221 222 456']

In [16]:
# Let's find the names of the male staff.
# "\." means a literal dot; an unescaped "." would match any character.
pattern = re.compile(r'Mr\. [a-zA-Z]+')
list(pattern.finditer(document))  # only the first word of each name is captured (e.g. 'Mr. Simon')

[<re.Match object; span=(78, 88), match='Mr. Albert'>,
 <re.Match object; span=(105, 114), match='Mr. David'>,
 <re.Match object; span=(131, 142), match='Mr. Lorenzo'>,
 <re.Match object; span=(185, 194), match='Mr. Simon'>]

In [24]:
# The question mark makes the preceding \s optional (0 or 1 whitespace characters),
# so a second word is captured as well when there is one.
pattern = re.compile(r'Mr\.\s[a-zA-Z]+\s?[a-zA-Z]+')
list(pattern.finditer(document))

[<re.Match object; span=(78, 88), match='Mr. Albert'>,
 <re.Match object; span=(105, 114), match='Mr. David'>,
 <re.Match object; span=(131, 142), match='Mr. Lorenzo'>,
 <re.Match object; span=(185, 201), match='Mr. Simon Duncan'>]

In [25]:
# Female staff: same shape as the "Mr." pattern.
# The dot is escaped so that it only matches a literal "." (a bare "." would match any character).
pattern = re.compile(r'Ms\.\s[a-zA-Z]+\s?[a-zA-Z]+')
list(pattern.finditer(document))

[<re.Match object; span=(159, 168), match='Ms. Sarah'>]

In [1]:
# All staff: [rs] accepts "Mr" or "Ms", and the escaped dot matches only a literal ".".
# (The earlier "M.." accepted any two characters after the M.)
pattern = re.compile(r'M[rs]\.\s[a-zA-Z]+\s?[a-zA-Z]+')
matches = list(pattern.finditer(document))
[m.group() for m in matches]

NameError: name 're' is not defined

In [30]:
# Marko's number was added to the document with a +44 prefix.
# The group (0|\+44\s) accepts either a leading 0 or "+44" followed by a whitespace character.
pattern = re.compile(r'(0|\+44\s)\d{4}[\s-]\d{3}[\s-]\d{3}')
match_list = list(pattern.finditer(document))
[m.group() for m in match_list]  # group() with no argument returns the whole match, not just the group

['07397-602-324',
 '07323-234-111',
 '07392-244-112',
 '07221-233-222',
 '07221 222 456',
 '+44 7738 433 978']

## Code to test if a string is a valid email address

In [None]:
import re
 
def is_valid_email(email):
    """Return True if ``email`` has the shape ``local@domain``, else False.

    The whole string must consist of:

    * a local part of at least two characters made of lowercase letters,
      digits and dots, starting with a letter or digit;
    * a single ``@``;
    * a domain of at least two characters made of lowercase letters,
      digits and dots, ending with a letter or digit.

    Parameters
    ----------
    email : str
        Candidate address to check.

    Returns
    -------
    bool
        True when the entire string matches the pattern, False otherwise.
    """
    pattern = r'[a-z0-9]+[a-z0-9\.]+@[a-z0-9\.]+[a-z0-9]+'
    # fullmatch rather than match: re.match only anchors the start of the
    # string, so an address followed by junk ('ab@cd.com!!!') was accepted.
    return re.fullmatch(pattern, email) is not None
    
# This is Kate's version, which she posted.

In [None]:
def is_valid_email(x):
    """Return True if ``x`` is a simple lowercase email address, else False.

    The whole string must consist of:

    * a local part of lowercase letters and digits, optionally containing
      one ``.`` or ``_`` between two such runs (at least two characters);
    * a single ``@``;
    * a domain name made of word characters;
    * a literal ``.`` followed by a suffix of two or three word characters.

    Parameters
    ----------
    x : str
        Candidate address to check.

    Returns
    -------
    bool
        True when the entire string matches the pattern, False otherwise.
    """
    import re  # local import preserved from the original cell

    pattern = re.compile(r'[a-z0-9]+[\._]?[a-z0-9]+@\w+\.\w{2,3}')
    # fullmatch replaces the ^...$ anchors: "$" also matches just before a
    # trailing newline, so 'name@host.com\n' was wrongly accepted.
    return pattern.fullmatch(x) is not None

# This is my version.