#### Regular Expression Lecture Note

In [1]:
import re

In [6]:
# Lecture 02: Regular Expressions with the RE Module

# Social-security-number shape: three digits, two digits, four digits.
ssnRegex = re.compile(r'\d\d\d-\d\d-\d\d\d\d')

# The trailing backslash continues the string literal on the next line, so
# that line's leading spaces are part of the text being searched.
ssnText = "The two ssn's you're looking for \
                            are 557-12-8176 and 321-54-9876"

# .search() stops at the first occurrence; group(0) is the whole matched text.
ssnMatch = ssnRegex.search(ssnText)
print(ssnMatch.group(0))

557-12-8176


In [7]:
# Lecture 03: Finding Multiple Occurrences of a Pattern Using findall()

ssnRegex = re.compile(r'\d\d\d-\d\d-\d\d\d\d')
ssnText = "The two ssn's you're looking for \
                            are 557-12-8176 and 321-54-9876"

# Despite its name, ssnMatch is a plain list of matched strings here,
# not a match object.
ssnMatch = ssnRegex.findall(ssnText)
print(ssnMatch)

# Note:
# .search() gives a match object; call .group() on it to get the text.
#   [If nothing matched the result is None, and None has no .group()]
# .findall() gives a list of strings; print it directly, no .group() needed.

['557-12-8176', '321-54-9876']


In [9]:
# Lecture 04: Grouping Regular Expressions Using ()

# zip code format = 12345-6789
# pattern = r'\d\d\d\d\d-\d\d\d\d'

# Starting point: the same pattern with no groups at all.
zipCodeRegex = re.compile(r'\d\d\d\d\d-\d\d\d\d')
zipSentence = 'My zip code is 12345-6789.'
zipMatch = zipCodeRegex.search(zipSentence)
print(zipMatch.group(0))

12345-6789


In [10]:
# Parentheses in the pattern create numbered capture groups.
zipCodeRegex = re.compile(r'(\d\d\d\d\d)-(\d\d\d\d)')
zipMatch = zipCodeRegex.search('My zip code is 12345-6789.')

# Group 0 is the entire match; groups 1 and 2 are the first and second
# parenthesised parts, counted left to right.
for groupNumber in (0, 1, 2):
    print(zipMatch.group(groupNumber))

# Note:
# With .search() and parens in the pattern, .group() returns the whole match,
# .group(1) returns what the first pair of parens matched,
# .group(2) returns what the second pair matched, and so on...

12345-6789
12345
6789


In [13]:
# zip code format = 12345-(6789)
# pattern = r'\d\d\d\d\d-\(\d\d\d\d\)'
# The parentheses are part of the text here, so each is escaped with a
# backslash to match it literally instead of opening a group.

zipCodeRegex = re.compile(r'\d\d\d\d\d-\(\d\d\d\d\)')
zipSentence = 'My zip code is 12345-(6789).'
zipMatch = zipCodeRegex.search(zipSentence)
print(zipMatch.group(0))

12345-(6789)


In [14]:
# Lecture 05: The Pipe Character '|'

# Inside the group, '|' separates the alternatives 'ning', 'ner' and 's'.
runRegex = re.compile(r'run(ning|ner|s)')
runMatch = runRegex.search('The marathon runner is quick.')

# First line: the whole match. Second line: the alternative that group 1 captured.
print(runMatch.group(0), runMatch.group(1), sep='\n')

runner
ner


In [15]:
# Same pattern as before; this sentence exercises the 'ning' alternative.
runRegex = re.compile(r'run(ning|ner|s)')
runMatch = runRegex.search('The marathon running is quick.')

# group(0, 1) returns a tuple: (whole match, text captured by group 1).
wholeWord, suffix = runMatch.group(0, 1)
print(wholeWord)
print(suffix)

running
ning


In [16]:
# Same pattern again; this sentence exercises the 's' alternative.
runRegex = re.compile(r'run(ning|ner|s)')
runSentence = 'The marathon runs is quick.'
runMatch = runRegex.search(runSentence)

# Print the whole match, then the captured alternative.
for groupNumber in (0, 1):
    print(runMatch.group(groupNumber))

runs
s


In [17]:
# Lecture 06: When .search() is Used and the Regular Expression is Not Found

runRegex = re.compile(r'run(ning|ner|s)')

# 'rune' starts with 'run' but is not followed by 'ning', 'ner' or 's'.
runeSentence = 'The rune was made of stone.'
runMatch = runRegex.search(runeSentence)
print(runMatch)

# Note: .search() returned None, so runMatch.group() cannot be used here;
#       calling it raises an error because None has no .group() method.

None


In [18]:
# Lecture 07: Matching 0 or 1 Group Using "?"

# '(wo)?' makes the group optional: 'wo' may appear zero times or once.
salesRegex = re.compile(r'sales(wo)?man')
salesSentence = 'I was a salesman!'
salesMatch = salesRegex.search(salesSentence)
print(salesMatch.group(0))

salesman


In [19]:
# Same optional group; this time 'wo' is present in the text.
salesRegex = re.compile(r'sales(wo)?man')
salesSentence = 'I was a saleswoman!'
salesMatch = salesRegex.search(salesSentence)
print(salesMatch.group(0))

saleswoman


In [21]:
# MM/DD/YYYY
# example date: 12/13/2005

# Fixed-width version: exactly two digits for the month and for the day.
dateRegex = re.compile(r'\d\d/\d\d/\d\d\d\d')
dateSentence = 'The date is 12/12/2016'
dateMatch = dateRegex.search(dateSentence)
print(dateMatch.group(0))

12/12/2016


In [24]:
# There are three other possible cases:
# 2/5/2012
# 12/9/1995
# 3/24/2001

# Each '(\d)?' makes the second digit of the month or of the day optional.
dateRegex = re.compile(r'\d(\d)?/\d(\d)?/\d\d\d\d')

for dateSentence in ('The date is 2/5/2012',
                     'The date is 12/9/1995',
                     'The date is 3/24/2001'):
    dateMatch = dateRegex.search(dateSentence)
    print(dateMatch.group())

2/5/2012
12/9/1995
3/24/2001


In [25]:
# Lecture 08: Matching 0 or More Groups Using "*"

# '(s)*' lets the group repeat any number of times, including zero.
snakeRegex = re.compile(r'snake(s)*')

for snakeSentence in ('I saw a green snake.',
                      'I saw green snakes.',
                      'I saw green snakessssssss.'):
    snakeMatch = snakeRegex.search(snakeSentence)
    print(snakeMatch.group())

snake
snakes
snakessssssss


In [27]:
# Lecture 09: Matching 1 or More Groups Using "+"

# '(s)+' requires the group at least once, so a bare 'snake' no longer matches.
snakeRegex = re.compile(r'snake(s)+')

# No trailing 's' in this sentence: print the search result itself (None).
snakeMatch = snakeRegex.search('I saw a green snake.')
print(snakeMatch)

# One or more trailing 's': a match exists, so .group() is safe to call.
for snakeSentence in ('I saw green snakes.', 'I saw green snakessssssss.'):
    snakeMatch = snakeRegex.search(snakeSentence)
    print(snakeMatch.group())

None
snakes
snakessssssss


In [29]:
# Lecture 10: Escaping “?”, “*”, and “+”

# A backslash before '+', '*' or '?' matches the character itself.
# The unescaped '+' after each '(\d)' is still the one-or-more quantifier.
mathRegex = re.compile(r'(\d)+ \+ (\d)+ \* (\d)+ = \?')
mathExpression = '34 + 43 * 9 = ?'
mathMatch = mathRegex.search(mathExpression)
print(mathMatch.group(0))

34 + 43 * 9 = ?


'34 + 43 * 9 = ?'

In [31]:
# Lecture 11: Pattern Matching Using {number of reps}

# '(la){4}' means the group 'la' repeated exactly four times in a row.
laRegex = re.compile(r'(la){4}')
laSentence = "She said \"lalalala\" I can't hear you"
laMatch = laRegex.search(laSentence)
print(laMatch.group(0))

lalalala


In [42]:
# The outer group is one date plus an optional ', ' separator; {4} repeats
# that whole unit four times so a single match spans all four dates.
dateRegex = re.compile(r'((\d)?\d/(\d)?\d/\d\d\d\d(, )?){4}')
datesSentence = 'The dates are 1/12/1999, 10/19/1987, 7/8/1985, 11/9/1980.'
dateMatch = dateRegex.search(datesSentence)
print(dateMatch.group(0))

1/12/1999, 10/19/1987, 7/8/1985, 11/9/1980


In [43]:
# Use findall() for this problem:
# the pattern has no parentheses, so each result is the full text of one date.
dateRegex = re.compile(r'\d?\d/\d?\d/\d\d\d\d')
datesSentence = 'The dates are 1/12/1999, 10/19/1987, 7/8/1985, 11/9/1980.'

# dateMatch is a list of strings here, not a match object.
dateMatch = dateRegex.findall(datesSentence)
print(dateMatch)

# Note:
# Be careful with parens in the pattern when using findall().
# Parens create groups, and findall() then returns the groups
# (as tuples when there are several) instead of the whole match.

['1/12/1999', '10/19/1987', '7/8/1985', '11/9/1980']


In [44]:
# Lecture 12: Pattern Matching Using {min,max} 

# '(blah){2,4}' accepts between two and four consecutive repetitions.
blahRegex = re.compile(r'(blah){2,4}')

for blahText in ('blahblah', 'blahblahblah', 'blahblahblahblah'):
    blahMatch = blahRegex.search(blahText)
    print(blahMatch.group())

blahblah
blahblahblah
blahblahblahblah


In [45]:
# With the minimum omitted, '{,5}' means zero to five repetitions.
blahRegex = re.compile(r'(blah){,5}')

# Zero repetitions are allowed, so 'Nothing' still matches -- with an empty
# string, which prints as a blank line. The final input has ten 'blah's, but
# the match is capped at five.
blahInputs = (
    'Nothing',
    'blah',
    'blahblah',
    'blahblahblah',
    'blahblahblahblah',
    'blahblahblahblahblah',
    'blahblahblahblahblahblahblahblahblahblah',
)
for blahText in blahInputs:
    blahMatch = blahRegex.search(blahText)
    print(blahMatch.group())


blah
blahblah
blahblahblah
blahblahblahblah
blahblahblahblahblah
blahblahblahblahblah


In [47]:
# With the maximum omitted, '{2,}' means two or more repetitions.
blahRegex = re.compile(r'(blah){2,}')

# Fewer than two repetitions: there is no match, so print the result itself.
for tooShort in ('Nothing', 'blah'):
    blahMatch = blahRegex.search(tooShort)
    print(blahMatch)

# Two or more repetitions: a match exists, and with no upper bound all ten
# 'blah's of the last input are consumed.
for blahText in ('blahblah',
                 'blahblahblah',
                 'blahblahblahblah',
                 'blahblahblahblahblah',
                 'blahblahblahblahblahblahblahblahblahblah'):
    blahMatch = blahRegex.search(blahText)
    print(blahMatch.group())

None
None
blahblah
blahblahblah
blahblahblahblah
blahblahblahblahblah
blahblahblahblahblahblahblahblahblahblah


In [50]:
# Lecture 13: Greedy and Non-Greedy Matching

# First pattern: greedy, takes as many repetitions as allowed (up to 3).
# Second pattern: the '?' after the quantifier makes it non-greedy, so it
# takes as few repetitions as possible (just 1).
for quantifiedPattern in (r'(\w){1,3}', r'(\w){1,3}?'):
    wordRegex = re.compile(quantifiedPattern)
    wordMatch = wordRegex.search('Eyes')
    print(wordMatch.group())

Eye
E


In [54]:
# Lecture 14: findall() with 0 or 1 groups

zipRegex = re.compile(r'\d\d\d\d\d')
addresses = """ 
360 Clark Rd.
Jackson, NJ 08527

563 Alderwood St.
Waltham, MA 02453

70 High Ave.
Boca Raton, FL 33428
"""

# .search() finds only the first occurrence of the pattern in the string
firstZipMatch = zipRegex.search(addresses)
print(firstZipMatch.group())

# .findall() finds every occurrence of the pattern in the string
allZips = zipRegex.findall(addresses)
print(allZips)

08527
['08527', '02453', '33428']


In [55]:
# Important Note:
# With no parens in the pattern, .findall() returns the whole match.
# With one pair of parens, .findall() returns only the text inside the parens.

# The group captures just the middle three digits of each five-digit code.
zipRegex = re.compile(r'\d(\d\d\d)\d')
addresses = """ 
360 Clark Rd.
Jackson, NJ 08527

563 Alderwood St.
Waltham, MA 02453

70 High Ave.
Boca Raton, FL 33428
"""
middleDigits = zipRegex.findall(addresses)
print(middleDigits)

['852', '245', '342']


In [58]:
# Lecture 15: findall() with 2 or more groups

# Group 1 (outer parens) is the full ZIP+4 code; group 2 (inner parens) is
# its five-digit prefix.
zipRegex = re.compile(r'((\d\d\d\d\d)-\d\d\d\d)')
addresses = """ 
360 Clark Rd.
Jackson, NJ 08527-9807

563 Alderwood St.
Waltham, MA 02453-5565

70 High Ave.
Boca Raton, FL 33428-1234
"""
zipTuples = zipRegex.findall(addresses)
print(zipTuples)

# The output is a list of tuples; each tuple holds all the groups of one match.

[('08527-9807', '08527'), ('02453-5565', '02453'), ('33428-1234', '33428')]


In [59]:
# Lecture 17: Making Your Own Character Classes Using []

# Some of the built-in regex character classes are: \d, \D, \w, \W, \s, \S
# Note that [] matches one SINGLE char, no matter how many chars are listed inside it.

letterRegex = re.compile(r'[tirT]')  # any one of the letters inside []
squirrelSentence = 'The squirrel climbed up the tree.'
letterMatch = letterRegex.findall(squirrelSentence)
print(letterMatch)

['T', 'i', 'r', 'r', 'i', 't', 't', 'r']


In [60]:
# '{2}' after the class: exactly two consecutive chars, each taken from the set.
letterRegex = re.compile(r'[sqtire]{2}')
squirrelSentence = 'The squirrel climbed up the tree.'
letterMatch = letterRegex.findall(squirrelSentence)
print(letterMatch)

['sq', 'ir', 're', 'tr', 'ee']


In [61]:
# '{2,}' after the class: runs of two or more consecutive chars from the set.
letterRegex = re.compile(r'[sqtire]{2,}')
squirrelSentence = 'The squirrel climbed up the tree.'
letterMatch = letterRegex.findall(squirrelSentence)
print(letterMatch)

['sq', 'irre', 'tree']


In [62]:
# '{2,3}' after the class: runs of two to three consecutive chars from the set.
letterRegex = re.compile(r'[sqtire]{2,3}')
squirrelSentence = 'The squirrel climbed up the tree.'
letterMatch = letterRegex.findall(squirrelSentence)
print(letterMatch)

['sq', 'irr', 'tre']


In [64]:
# A hyphen inside [] defines a range: this class matches one single char that
# is either a lowercase letter a-d or an uppercase letter A-P.
letterRegex = re.compile(r'[a-dA-P]')
twisterSentence = 'Peter Piper Picked A Peck Of Pickled Peppers.'
letterMatch = letterRegex.findall(twisterSentence)
print(letterMatch)

['P', 'P', 'P', 'c', 'd', 'A', 'P', 'c', 'O', 'P', 'c', 'd', 'P']


In [66]:
# Lecture 18: Negative Character Classes Using ^

# A '^' right after '[' negates the class: it matches any single char that is
# NOT one of the listed letters (spaces and punctuation included).
letterRegex = re.compile(r'[^tirT]')
squirrelText = 'The squirrel.'
letterMatch = letterRegex.findall(squirrelText)
print(letterMatch)

['h', 'e', ' ', 's', 'q', 'u', 'e', 'l', '.']


In [74]:
# Lecture 19: Using ^ to match a regular expression at the beginning of a string

# Outside [], '^' anchors the pattern: 'rude' must be at the very start of
# the searched string.
DrWhoRegex = re.compile(r'^rude')

# First phrase starts with 'rude' (match object); second only ends with it (None).
for phrase in ('rude and not ginger', 'That was rude'):
    print(DrWhoRegex.search(phrase))

<_sre.SRE_Match object; span=(0, 4), match='rude'>
None


In [75]:
# DrWhoRegex is the compiled r'^rude' pattern from the previous cell.
rudeMatch = DrWhoRegex.search('rude and not ginger')
print(rudeMatch.group())

# Note: when a match is found, printing .group() is preferred over printing the
#       match object, because the object's display, e.g.
#       <_sre.SRE_Match object; span=(0, 4), match='rude'>, shows at most
#       50 chars in match='...'. .group() is not subject to this limit.

rude


In [76]:
# Lecture 20: Using $ to match a regular expression at the end of a string

# '$' anchors the pattern: 'rude' must be at the end of the searched string.
endRegex = re.compile(r'rude$')
endMatch = endRegex.search('That was rude')
print(endMatch)

<_sre.SRE_Match object; span=(9, 13), match='rude'>


In [80]:
# Lecture 21: Using ^ at the beginning and $ at the end of a regex pattern simultaneously

# With both anchors the ENTIRE string must consist of word characters, so
# the space in 'South Africa' prevents a match.
wordRegex = re.compile(r'^\w+$')
for countryName in ('Pakistan', 'South Africa'):
    print(wordRegex.search(countryName))

<_sre.SRE_Match object; span=(0, 8), match='Pakistan'>
None


In [84]:
# Lecture 22: Using the “.” Character By Itself

# '.' matches any single character except newline '\n'
ightRegex = re.compile(r'.ight')
ightSentence = "I might turn right at the light so that I don't miss my flight."

# Only one char is taken before 'ight', so 'flight' contributes 'light'.
ightRegex.findall(ightSentence)

['might', 'right', 'light', 'light']

In [86]:
# Lecture 23: Using the “.” Character With {min,max}

# '.{2}' takes exactly two arbitrary chars before 'ight'; a space counts,
# which is why most results begin with one.
ightRegex = re.compile(r'.{2}ight')
ightSentence = "I might turn right at the light so that I don't miss my flight."
ightRegex.findall(ightSentence)

[' might', ' right', ' light', 'flight']

In [87]:
# '.{2,4}' takes two to four arbitrary chars before 'ight'; being greedy it
# takes as many as it can, limited here by where each match may start.
ightRegex = re.compile(r'.{2,4}ight')
ightSentence = "I might turn right at the light so that I don't miss my flight."
ightRegex.findall(ightSentence)

['I might', 'rn right', 'he light', 'y flight']

In [92]:
# Lecture 24:  Using “.*” to find patterns in strings containing words of unknown length

# Each '.*' stands in for text of any length (country name, capital name),
# spaces included.
countryCapRegex = re.compile(r'The capital city of .* is .*')

capitalSentences = (
    "The capital city of Hungry is Budapest",
    "The capital city of Spain is Madrid",
    "The capital city of United Kingdom is London",
)
for capitalSentence in capitalSentences:
    print(countryCapRegex.search(capitalSentence).group())

The capital city of Hungry is Budapest
The capital city of Spain is Madrid
The capital city of United Kingdom is London


In [100]:
# Lecture 25: Greedy and Non-Greedy Matching with “.*” and “.*?”

numText = '012345012345'

# greedy match (the default): '.*' runs on to the LAST '5' in the text.
numRegex = re.compile(r'0.*5')
greedyResult = numRegex.findall(numText)
print(greedyResult)

# non-greedy match: a '?' after the quantifier (.*? or .+?) makes it stop at
# the FIRST '5' it can, so two separate matches are found.
numRegex = re.compile(r'0.*?5')
lazyResult = numRegex.findall(numText)
print(lazyResult)

['012345012345']
['012345', '012345']


In [99]:
# Note: with findall() and one set of parens, the output is only what is inside that one set of parens.
# The final '5' sits outside the group, so it is matched but not returned.

numText = '012345012345'

# Greedy group, then non-greedy group.
for groupedPattern in (r'(0.*)5', r'(0.*?)5'):
    numRegex = re.compile(groupedPattern)
    print(numRegex.findall(numText))

['01234501234']
['01234', '01234']


In [101]:
# Lecture 26: Matching “\n” with DOTALL

# the string: three lines of text joined by '\n' newline characters
# (adjacent string literals are concatenated into one string).
DrSeuss = (
    "The Grinch hated Christmas\n"
    "The whole Christmas season!\n"
    "Now, please don't ask why."
)
print(DrSeuss)

The Grinch hated Christmas
The whole Christmas season!
Now, please don't ask why.


In [104]:
# Without re.DOTALL, '.' does not match '\n', so '.*' stops at the end of
# the first line of DrSeuss (the multi-line string defined in an earlier cell).
nonDotallRegex = re.compile(r'.*')
firstLineMatch = nonDotallRegex.search(DrSeuss)
firstLineMatch.group()

'The Grinch hated Christmas'

In [105]:
# With re.DOTALL, '.' also matches '\n', so '.*' runs through all three lines.
# The cell shows the string's repr, which is why '\n' appears as an escape.
DotallRegex = re.compile(r'.*', re.DOTALL)
wholeTextMatch = DotallRegex.search(DrSeuss)
wholeTextMatch.group()

"The Grinch hated Christmas\nThe whole Christmas season!\nNow, please don't ask why."

In [106]:
# add print():
DotallRegex = re.compile(r'.*', re.DOTALL)
print(DotallRegex.search(DrSeuss).group())

The Grinch hated Christmas
The whole Christmas season!
Now, please don't ask why.


In [107]:
# Lecture 27: Using re.IGNORECASE/re.I to do a case insensitive regular expression match

# The class lists only lowercase 'p'; the flag makes it match 'P' as well.
twisterRegex = re.compile(r'[p]', re.IGNORECASE)
twisterSentence = "Peter Piper Picked a Peck of Pickled Peppers"
twisterRegex.findall(twisterSentence)  # returns both P and p

['P', 'P', 'p', 'P', 'P', 'P', 'P', 'p', 'p']

In [109]:
# Lecture 28: Finding and Replacing a Regular Expression using the sub() function

# One- or two-digit month and day, four-digit year.
dateRegex = re.compile(r'(\d)?\d/(\d)?\d/\d\d\d\d')
dateSentence = "Today's date is 11/25/2014"

# First confirm that the pattern finds the date that will be replaced.
dateRegex.search(dateSentence)

<_sre.SRE_Match object; span=(16, 26), match='11/25/2014'>

In [110]:
# Module-level form:     re.sub(pat, replacement, str)
# Compiled-pattern form: pat.sub(replacement, str)   <- used here
# dateRegex is the compiled date pattern from the previous cell.

newDate = "11/25/2017"
dateRegex.sub(newDate, "Today's date is 11/25/2014")

"Today's date is 11/25/2017"

In [67]:
# Lecture 29: Partial Find and Replace Using Sub()

# The label 'password: ' followed by one or more word characters.
passwordRegex = re.compile(r'password: \w+')
passwordSentence = 'This is the password: password12345'
passwordMatch = passwordRegex.search(passwordSentence)
print(passwordMatch)

<_sre.SRE_Match object; span=(12, 35), match='password: password12345'>


In [70]:
passwordRegex = re.compile(r'password: (\w)\w+')  # (\w) is group(1)

# '\1' in the replacement is filled in with whatever group 1 captured
# (the first character of the password); the rest is replaced by '****'.
maskedTemplate = r'password: \1****'
subRegex = passwordRegex.sub(maskedTemplate, 'This is the password: password12345')
print(subRegex)

# Note: the replacement string is inserted as written. Regex meta-characters
#       such as '*' have no special meaning in it; only backslash escapes such
#       as the \1 group reference are interpreted.

This is the password: p****


In [None]:
# Lecture 30: Using Verbose Mode

# Option re.VERBOSE makes long and complex regular expressions easier to read by
# allowing layout and comments inside the pattern.

# 1. Whitespace is ignored. Spaces, tabs and newlines written in the pattern are
#    not matched as spaces, tabs or newlines. They're not matched at all.
#    If you want to match a space in a verbose regular expression, you'll need to
#    escape it by putting a backslash in front of it.

# 2. Comments are ignored. A comment in a verbose regular expression is just like
#    a comment in Python code: it starts with a # character and goes until the end
#    of the line. In this case it's a comment within a multi-line string instead of
#    within your source code, but it works the same way.

# the regex before:
dateRegex = re.compile(r'(\d)?\d/(\d)?\d/\d\d\d\d')

# use re.VERBOSE instead: the same pattern, one component per line.
# Every '(' must be closed on the component it belongs to; an unclosed group
# makes re.compile() raise re.error.
dateRegex = re.compile(r'''
(\d)?\d   # month: one or two digits
/         # first slash
(\d)?\d   # day: one or two digits
/         # second slash
\d\d\d\d  # year: four digits
''', re.VERBOSE)