# Finding Patterns of Text with Regex

In [2]:
import re


# re.compile() turns the pattern string into a reusable Regex object.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d')

# search() returns a Match object; its group() method hands back the
# text that actually matched.
matchObject = phoneNumRegex.search('My number is 452-155-122')
matchedText = matchObject.group()
print('Phone number found is: ' + matchedText)

Phone number found is: 452-155-122


In [4]:
import re

# Each pair of parentheses in the pattern creates a numbered group.
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d)')
matchObject = phoneNumRegex.search('My number is 452-155-122')

# group(1) and group(2) select one group each; group() with no argument
# is the entire matched text.
firstGroup = matchObject.group(1)
secondGroup = matchObject.group(2)
wholeMatch = matchObject.group()
print('Phone number group 1 is: ' + firstGroup)
print('Phone number group 2 is: ' + secondGroup)
print('Phone number found is: ' + wholeMatch)

Phone number group 1 is: 452
Phone number group 2 is: 155-122
Phone number found is: 452-155-122


In [8]:
import re

phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d)')
matchObject = phoneNumRegex.search('My number is 452-155-122')

# groups() (note the plural) returns every group at once as a tuple, so
# the pieces can be unpacked with a single multiple assignment.
allGroups = matchObject.groups()
areaCode, mainNumber = allGroups
for part in allGroups:
    print(part)

452
155-122


In [9]:
import re

# A backslash escapes the ( and ) characters: \( and \) in the raw string
# passed to re.compile() match literal parenthesis characters instead of
# opening or closing a group.
phoneNumRegex = re.compile(r'(\(\d\d\d\))-(\d\d\d-\d\d\d)')
matchObject = phoneNumRegex.search('My number is (452)-155-122')
areaCode = matchObject.group(1)
mainNumber = matchObject.group(2)
print(areaCode)
print(mainNumber)

(452)
155-122


## Matching Multiple Groups with the Pipe

In [10]:
# The pipe character | matches either alternative: Batman or Tina Fey.
heroRegex = re.compile(r'Batman|Tina Fey')

# When both alternatives occur in the searched string, the first
# occurrence of matching text is the one returned in the Match object.
mo1 = heroRegex.search('Batman Tina Fey')
firstHero = mo1.group()
print(firstHero)

Batman


In [2]:
import re


# The shared prefix 'Bat' is written once; the parenthesised group lists
# the alternative endings separated by pipes.
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
mo3 = batRegex.search('Batmobile and Batman lost a wheel')

fullText = mo3.group()    # whole match
suffix = mo3.group(1)     # only the part matched inside the parentheses
print(fullText)
print(suffix)

Batmobile
mobile


## Optional matching with Question Mark

In [5]:
import re

# The ? marks the preceding group (wo) as optional: zero or one occurrence.
batRegex = re.compile(r'Bat(wo)?man')
mo1 = batRegex.search('The adventures of Batman')
mo2 = batRegex.search('Th Adventures of Batwoman')
for matched in (mo1, mo2):
    print(matched.group())

Batman
Batwoman


## Matching Zero or More with the Star

In [7]:
import re
# The * lets the group (wo) repeat any number of times, including zero.
batRegex = re.compile(r'Bat(wo)*man')
mo1 = batRegex.search('Hello Batwowowowowowoman')
mo2 = batRegex.search('Hello Batman')
for matched in (mo1, mo2):
    print(matched.group())

Batwowowowowowoman
Batman


## Matching one or more with Plus

In [13]:
import re
# The + requires the group (wo) to appear at least once.
batRegex = re.compile(r'Bat(wo)+man')
mo1 = batRegex.search('Hello Batwowowowowowoman')
mo2 = batRegex.search('Hello Batman')   # no 'wo' at all, so there is no match

repeatedMatch = mo1.group()
print(repeatedMatch)
print(mo2)

Batwowowowowowoman
None


## Matching Specific Repetitions with Curly Brackets

In [14]:
import re

# {3} asks for exactly three repetitions of the group (Ha); the forms
# {,3}, {3,} and {3,5} are allowed as well.
haRegex = re.compile(r'(Ha){3}')
mo1 = haRegex.search('HaHaHa')
tripleHa = mo1.group()
print(tripleHa)

HaHaHa


# Greedy and Nongreedy Matching

In [1]:
import re

# Both patterns accept three to five repetitions of 'Ha'. The plain form
# is greedy and takes the longest possible match; the form with a
# trailing ? is nongreedy and takes the shortest one.
greedyRegex = re.compile(r'(Ha){3,5}')
nongreedyRegex = re.compile(r'(Ha){3,5}?')

laughter = 'Hello HaHaHaHaHa'
mo1 = greedyRegex.search(laughter)
print(mo1.group())
mo2 = nongreedyRegex.search(laughter)
print(mo2.group())

HaHaHaHaHa
HaHaHa


# The findall() method

### The search() method returns a Match object for the first match in the searched string. The findall() method, on the other hand, returns not a Match object but a list of strings, as long as there are no groups in the regular expression.

In [2]:
import re

# This pattern contains no groups, so findall() yields a plain list of
# the matched strings.
phoneRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
foundNumbers = phoneRegex.findall('Cell: 514-586-8559 Work: 259-855-6969')
print(foundNumbers)

['514-586-8559', '259-855-6969']


In [3]:
import re

# This pattern contains groups, so findall() yields a list of tuples,
# one tuple per match with one string per group.
phoneRegex = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)')
groupedNumbers = phoneRegex.findall('Cell: 514-586-8559 Work: 259-855-6969')
print(groupedNumbers)

[('514', '586', '8559'), ('259', '855', '6969')]


## Character Classes
1. *\d* : Any numeric digit from 0-9
2. *\D* : Any character that is not a numeric digit from 0-9
3. *\w* : Any letter, numeric digit, or the underscore character
4. *\W* : Any character that is not a letter, numeric digit, or the underscore character
5. *\s* : Any space, tab, or newline character
6. *\S* : Any character that is not space, tab or newline

In [4]:
import re

# \d+ is one or more digits, \s a single whitespace character and \w+
# one or more word characters (letters, digits or underscore).
xmasRegex = re.compile(r'\d+\s\w+')
giftList = '12 drummers, 11 pipers, 10 lords, 9 ladies, 8 maids'
xmasRegex.findall(giftList)

['12 drummers', '11 pipers', '10 lords', '9 ladies', '8 maids']

# Making Your Own Character Classes

In [5]:
import re

# Square brackets define a custom character class; this one matches any
# single lowercase or uppercase vowel.
vowelRegex = re.compile(r'[aeiouAEIOU]')
sampleSentence = 'RoboCop eats baby food. BABY FOOD.'
vowelRegex.findall(sampleSentence)

['o', 'o', 'o', 'e', 'a', 'a', 'o', 'o', 'A', 'O', 'O']

In [6]:
# A caret ^ right after the opening bracket creates a negative character
# class: it matches every character that is NOT listed, which here also
# includes spaces and punctuation.
consonantRegex = re.compile(r'[^aeiouAEIOU]')
sampleSentence = 'RoboCop eats baby food. BABY FOOD.'
consonantRegex.findall(sampleSentence)

['R',
 'b',
 'C',
 'p',
 ' ',
 't',
 's',
 ' ',
 'b',
 'b',
 'y',
 ' ',
 'f',
 'd',
 '.',
 ' ',
 'B',
 'B',
 'Y',
 ' ',
 'F',
 'D',
 '.']

# The Caret and Dollar sign characters

In [10]:
# A caret at the start of the pattern means the match must occur at the
# beginning of the searched text.
beginWithHello = re.compile(r'^Hello')
for greeting in ('Hello world', 'Hi there.'):
    print(beginWithHello.search(greeting))

<_sre.SRE_Match object; span=(0, 5), match='Hello'>
None


In [25]:
# With ^ at the start and $ at the end the entire string has to match,
# so (despite the variable's name) only strings made up of digits from
# start to finish are accepted.
endsWithNumber = re.compile(r'^\d+$')
candidates = (
    'Your age is 52 25585 and fourty222',
    'Your age is 555',
    '5555555',
    '55 5555',
)
for candidate in candidates:
    print(endsWithNumber.search(candidate))

None
None
<_sre.SRE_Match object; span=(0, 7), match='5555555'>
None


# The Wildcard character (.)

In [28]:
# The dot is a wildcard for exactly one character (anything except a
# newline), so each result is 'at' preceded by a single character.
atRegex = re.compile(r'.at')
rhymeText = 'The cat sat in the mat on the flat blablablaat'
atRegex.findall(rhymeText)

['cat', 'sat', 'mat', 'lat', 'aat']

## Matching Everything with Dot-Star

1. The dot character (`.`) means "any single character except the newline".
2. The star character (`*`) means "zero or more of the preceding character".
3. Dot-star (`.*`) uses greedy mode by default.
4. Use `.*?` (dot-star followed by a question mark) for nongreedy mode.

In [30]:
# Each (.*) group captures whatever text follows its label.
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
mo = nameRegex.search('First Name: Akash Last Name: Giri')

# Index 0 is the whole match; 1 and 2 are the two captured names.
for groupIndex in (0, 1, 2):
    print(mo.group(groupIndex))

First Name: Akash Last Name: Giri
Akash
Giri


In [32]:
# The same text is searched twice: the nongreedy .*? stops at the first
# closing bracket, while the greedy .* runs on to the last one.
dinnerText = '<To serve man> for dinner.>'
nongreedyRegex = re.compile(r'<.*?>')
greedyRegex = re.compile(r'<.*>')

mo = nongreedyRegex.search(dinnerText)
print(mo.group())
mo1 = greedyRegex.search(dinnerText)
print(mo1.group())

<To serve man>
<To serve man> for dinner.>


## Matching Newlines with Dot character
The dot character matches everything except a newline. By passing re.DOTALL as the second argument to re.compile(), you can make the dot character match all characters, including the newline.

In [33]:
# Compiled without flags, the dot does not match a newline, so .* only
# captures the text up to the first line break.
noNewlineRegex = re.compile('.*')
primeDirectives = 'Serve the public trust . \n Protect the innocent.'
noNewlineRegex.search(primeDirectives)

<_sre.SRE_Match object; span=(0, 25), match='Serve the public trust . '>

In [34]:
# With re.DOTALL the dot matches newlines as well, so .* now spans the
# whole text across the line break.
noNewlineRegex = re.compile('.*', re.DOTALL)
primeDirectives = 'Serve the public trust . \n Protect the innocent.'
noNewlineRegex.search(primeDirectives)

<_sre.SRE_Match object; span=(0, 48), match='Serve the public trust . \n Protect the innocent.>

# Case-Insensitive Matching

In [36]:
# re.I (short for re.IGNORECASE) makes the pattern ignore letter case.
robocop = re.compile(r'robocop', re.I)
description = 'RobOcoP is a man, part machine all cop.'
robocop.search(description).group()

'RobOcoP'

## Substituting Strings with the sub() Method

In [37]:
# sub() takes the replacement first and the text second; every match of
# 'Agent <name>' in the text is replaced.
namesRegex = re.compile(r'Agent \w+')
report = 'Agent Alice gave the secret documents to Agent Bob.'
namesRegex.sub('CENSORED', report)

'CENSORED gave the secret documents to CENSORED.'

In [39]:
# Group 1 captures only the first letter of the name; \1 in the
# replacement string inserts that captured letter back into the result.
agentNamesRegex = re.compile(r'Agent (\w)\w*')
report = 'Agent Alice gave the secret documents to Agent Bob.'
agentNamesRegex.sub(r'\1****', report)

'A**** gave the secret documents to B****.'

# Managing Complex Regexes
## Use re.VERBOSE

In [42]:
# re.VERBOSE lets the pattern span several lines: whitespace inside the
# pattern is ignored and everything after a # is treated as a comment.
# The extension keyword is written as ext\.?|x so that the dot is a
# literal period; an unescaped "ext." would accept ANY character after
# "ext" (e.g. "extA 12"). Group numbering is unchanged (6 groups).
phoneRegex = re.compile(r'''(
    (\d{3}|\(\d{3}\))?           # area code, with or without parentheses
    (\s|-|\.)?                   # separator
    \d{3}                        # first 3 digits
    (\s|-|\.)                    # separator
    \d{4}                        # last 4 digits
    (\s*(ext\.?|x)\s*\d{2,5})?   # extension: ext, ext. or x, then 2-5 digits
    )''', re.VERBOSE)

# Combining re.IGNORECASE, re.DOTALL and re.VERBOSE

In [43]:
# re.compile() accepts a single flags argument, so several options are
# merged into one value with the bitwise OR operator.
combinedFlags = re.IGNORECASE | re.DOTALL | re.VERBOSE
someRegex = re.compile('foo', combinedFlags)