# Python regular expressions tutorial
___

In [2]:
import re

In [5]:
import sys

# Names that IPython itself puts in the namespace, plus this list's own name,
# so that none of them appear in the report below.
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Report (name, sys.getsizeof of its value) for every remaining name, largest first.
# Names starting with an underscore or matching a loaded module name are skipped.
sorted(
    (
        (name, sys.getsizeof(globals().get(name)))
        for name in dir()
        if not (
            name.startswith('_')
            or name in sys.modules
            or name in ipython_vars
        )
    ),
    key=lambda entry: entry[1],
    reverse=True,
)

[('data', 53)]

## Simple

In [99]:
# Sample text searched by the pattern cells further down.
# It is a raw string because the metacharacter line contains a lone backslash,
# which a normal string literal would read as the start of an invalid escape
# sequence. The resulting text is unchanged by the `r` prefix.
text_to_search = r'''
abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890
Ha HaHa
MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )
coreyms.com
321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234
Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T

cat
mat
pat
bat
'''

In [4]:
# Sample sentence reused by the anchor, match/search and flag cells.
sentence = 'Start a sentence and then bring it to an end'

# re.I ignores case, so the lowercase pattern still finds the capitalised 'Start'.
pattern = re.compile(r'start', flags=re.I)
print(pattern.search(sentence))

<re.Match object; span=(0, 5), match='Start'>


In [16]:
# \b is a word boundary: only an 'Ha' at the start of a word is reported,
# so the second half of 'HaHa' is skipped.
pattern = re.compile(r'\bHa')
for match in pattern.finditer(text_to_search):
    print(match)

<re.Match object; span=(66, 68), match='Ha'>
<re.Match object; span=(69, 71), match='Ha'>


In [24]:
# $ anchors the pattern at the end of the string.
pattern = re.compile(r'end$')
for match in pattern.finditer(sentence):
    print(match)

<re.Match object; span=(41, 44), match='end'>


In [34]:
# Character sets: a leading 8 or 9 followed by '00', with '.' or '-' as separator.
pattern = re.compile(r'[89]00[.-]\d\d\d[.-]\d\d\d\d')
for match in pattern.finditer(text_to_search):
    print(match)

<re.Match object; span=(190, 202), match='800-555-1234'>
<re.Match object; span=(203, 215), match='900-555-1234'>


In [41]:
# Negated character set: any character other than 'b', followed by 'at'.
pattern = re.compile(r'[^b]at')
for match in pattern.finditer(text_to_search):
    print(match)

<re.Match object; span=(267, 270), match='cat'>
<re.Match object; span=(271, 274), match='mat'>
<re.Match object; span=(275, 278), match='pat'>


In [102]:
# {n} repeats the preceding token exactly n times. The unescaped '.' accepts
# any separator character, which is why '123*555*1234' is matched too.
pattern = re.compile(r'\d{3}.\d{3}.\d{4}')
for match in pattern.finditer(text_to_search):
    print(match)

<re.Match object; span=(151, 163), match='321-555-4321'>
<re.Match object; span=(164, 176), match='123.555.1234'>
<re.Match object; span=(177, 189), match='123*555*1234'>
<re.Match object; span=(190, 202), match='800-555-1234'>
<re.Match object; span=(203, 215), match='900-555-1234'>


In [56]:
# 'Mr' with an optional literal period, one whitespace character, then word characters.
pattern = re.compile(r'Mr\.?\s\w+')
for match in pattern.finditer(text_to_search):
    print(match)

<re.Match object; span=(216, 227), match='Mr. Schafer'>
<re.Match object; span=(228, 236), match='Mr Smith'>
<re.Match object; span=(260, 265), match='Mr. T'>


In [60]:
# A group with alternatives: 'M' followed by 'r', 's' or 'rs'.
pattern = re.compile(r'M(r|s|rs)\.?\s\w+')
for match in pattern.finditer(text_to_search):
    print(match)

<re.Match object; span=(216, 227), match='Mr. Schafer'>
<re.Match object; span=(228, 236), match='Mr Smith'>
<re.Match object; span=(237, 245), match='Ms Davis'>
<re.Match object; span=(246, 259), match='Mrs. Robinson'>
<re.Match object; span=(260, 265), match='Mr. T'>


In [61]:
# Same titles as the previous pattern, with each alternative spelled out in full.
pattern = re.compile(r'(Mr|Ms|Mrs)\.?\s\w+')
for match in pattern.finditer(text_to_search):
    print(match)

<re.Match object; span=(216, 227), match='Mr. Schafer'>
<re.Match object; span=(228, 236), match='Mr Smith'>
<re.Match object; span=(237, 245), match='Ms Davis'>
<re.Match object; span=(246, 259), match='Mrs. Robinson'>
<re.Match object; span=(260, 265), match='Mr. T'>


In [96]:
# With a capturing group in the pattern, findall returns the group's text
# (the title) instead of the whole match.
pattern = re.compile(r'(Mr|Ms|Mrs)\.?\s\w+')
for match in pattern.findall(text_to_search):
    print(match)

Mr
Mr
Ms
Mrs
Mr


In [104]:
# Without any group, findall returns the full matched strings.
pattern = re.compile(r'\d{3}.\d{3}.\d{4}')
for match in pattern.findall(text_to_search):
    print(match)

321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234


###### Match only at the beginning of the string

In [108]:
# match() only tries the pattern at the start of the string.
pattern = re.compile(r'Start')
print(pattern.match(sentence))

<re.Match object; span=(0, 5), match='Start'>


###### Search through the entire string

In [110]:
# search() scans the string and returns the first match found at any position.
pattern = re.compile(r'sentence')
print(pattern.search(sentence))

<re.Match object; span=(8, 16), match='sentence'>


###### flags

In [114]:
# The IGNORECASE flag lets the lowercase pattern match 'Start'.
pattern = re.compile(r'start', flags=re.IGNORECASE)
print(pattern.search(sentence))

<re.Match object; span=(0, 5), match='Start'>


In [115]:
# re.I is the short spelling of re.IGNORECASE.
pattern = re.compile(r'start', flags=re.I)
print(pattern.search(sentence))

<re.Match object; span=(0, 5), match='Start'>


## Data.txt

In [35]:
# Define the pattern in this cell instead of reusing whatever `pattern` an
# earlier cell left behind. The recorded output lists 800/900 numbers, so the
# 800/900 phone-number pattern is the one compiled here.
pattern = re.compile(r'[89]00[.-]\d\d\d[.-]\d\d\d\d')

# The file is only needed for the read, so it is closed before scanning.
with open('data.txt') as f:
    contents = f.read()

for match in pattern.finditer(contents):
    print(match)

<re.Match object; span=(102, 114), match='800-555-5669'>
<re.Match object; span=(281, 293), match='900-555-9340'>
<re.Match object; span=(467, 479), match='800-555-6771'>
<re.Match object; span=(1091, 1103), match='900-555-3205'>
<re.Match object; span=(1439, 1451), match='800-555-6089'>
<re.Match object; span=(1790, 1802), match='800-555-7100'>
<re.Match object; span=(2051, 2063), match='900-555-5118'>
<re.Match object; span=(2826, 2838), match='900-555-5428'>
<re.Match object; span=(3284, 3296), match='800-555-8810'>
<re.Match object; span=(3971, 3983), match='900-555-9598'>
<re.Match object; span=(4945, 4957), match='800-555-2420'>
<re.Match object; span=(5566, 5578), match='900-555-3567'>
<re.Match object; span=(6189, 6201), match='800-555-3216'>
<re.Match object; span=(6889, 6901), match='900-555-7755'>
<re.Match object; span=(7864, 7876), match='800-555-1372'>
<re.Match object; span=(8741, 8753), match='900-555-6426'>


## E-mails

In [69]:
import re

# Three sample addresses, one per line.
emails = '''
CoreyMSchafer@gmail.com
corey.schafer@university.edu
corey-321-schafer@my-work.net
'''

# Local part: letters, digits, '.' and '-'. Domain: letters and '-'.
# The top-level domain must be com, edu or net.
pattern = re.compile(r'[a-zA-Z0-9.-]+@[a-zA-Z-]+\.(com|edu|net)')

for match in pattern.finditer(emails):
    print(match)

<re.Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>
<re.Match object; span=(25, 53), match='corey.schafer@university.edu'>
<re.Match object; span=(54, 83), match='corey-321-schafer@my-work.net'>


In [71]:
import re

# Three sample addresses, one per line.
emails = '''
CoreyMSchafer@gmail.com
corey.schafer@university.edu
corey-321-schafer@my-work.net
'''

# A more general e-mail pattern: the local part also allows '_' and '+', the
# domain allows digits, and anything made of letters, digits, '.' or '-' is
# accepted after the first dot.
# The literal '-' is written last in every character set so it can never be
# read as a range operator.
pattern = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+')

matches = pattern.finditer(emails)

for match in matches:
    print(match)

<re.Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>
<re.Match object; span=(25, 53), match='corey.schafer@university.edu'>
<re.Match object; span=(54, 83), match='corey-321-schafer@my-work.net'>


In [None]:
# The general e-mail pattern on its own, for reference. Written as a raw string
# so that `\.` is not read as a string escape sequence.
r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'

## URLs

In [81]:
import re

# Four sample URLs, one per line.
urls = '''
https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov
'''

# 'http' with an optional 's', an optional 'www.' prefix, then name and suffix.
pattern = re.compile(r'https?://(www\.)?\w+\.\w+')

for match in pattern.finditer(urls):
    print(match)

<re.Match object; span=(1, 23), match='https://www.google.com'>
<re.Match object; span=(24, 42), match='http://coreyms.com'>
<re.Match object; span=(43, 62), match='https://youtube.com'>
<re.Match object; span=(63, 83), match='https://www.nasa.gov'>


In [89]:
import re

# Four sample URLs, one per line.
urls = '''
https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov
'''

# Three groups: the optional 'www.' prefix, the domain name, and the dotted suffix.
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')

# group(3) is the suffix captured by the third pair of parentheses.
for match in pattern.finditer(urls):
    print(match.group(3))

.com
.com
.com
.gov


In [90]:
import re

# Four sample URLs, one per line.
urls = '''
https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov
'''

# Three groups: the optional 'www.' prefix, the domain name, and the dotted suffix.
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')

# Replace each URL with groups 2 and 3 (name + suffix), which drops the scheme
# and any 'www.' prefix. The result is bound to the same name that is printed;
# the earlier spelling assigned `sibbed_urls` but printed `subbed_urls`.
subbed_urls = pattern.sub(r'\2\3', urls)
print(subbed_urls)


google.com
coreyms.com
youtube.com
nasa.gov



In [95]:
import re

# Four sample URLs, one per line.
urls = '''
https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov
'''

# Three groups: the optional 'www.' prefix, the domain name, and the dotted suffix.
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')

# With several groups, findall yields one tuple of group texts per match;
# a group that did not take part shows up as an empty string.
for match in pattern.findall(urls):
    print(match)

('www.', 'google', '.com')
('', 'coreyms', '.com')
('', 'youtube', '.com')
('www.', 'nasa', '.gov')


In [74]:
# Bare expression: the notebook shows the string's repr, so the line breaks
# appear as \n escapes rather than as separate lines.
subbed_urls

'\ngoogle.com\ncoreyms.com\nyoutube.com\nnasa.gov\n'

In [None]:
# The URL pattern on its own, for reference. Kept as a raw string literal so the
# cell is valid Python; the bare pattern text would raise a SyntaxError when run.
r'https?://(www\.)?(\w+)(\.\w+)'