# Pattern Matching with Regular Expressions

In [1]:
def isPhoneNumber(text):
    """Return True if text looks exactly like 'ddd-ddd-dddd', else False.

    The string must be 12 characters long: three decimal digits, a hyphen,
    three decimal digits, a hyphen, and four decimal digits.
    """
    if len(text) != 12:
        return False
    # The length is known to be 12 here, so every slice below is non-empty
    # and str.isdecimal() is True only when all of its characters are digits.
    return (
        text[0:3].isdecimal()         # area code
        and text[3] == '-'            # first hyphen
        and text[4:7].isdecimal()     # first three digits
        and text[7] == '-'            # second hyphen
        and text[8:12].isdecimal()    # last four digits
    )

In [2]:
# Show the verdict of isPhoneNumber() for one valid and one invalid input.
for candidate in ('415-555-4242', 'Moshi moshi'):
    print(candidate + ' is a phone number:')
    print(isPhoneNumber(candidate))

415-555-4242 is a phone number:
True
Moshi moshi is a phone number:
False


## Creating Regex Objects

In [3]:
import re

phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')

## Matching Regex Objects

In [4]:
mo = phoneNumRegex.search('My number is 415-555-4242.')
print('Phone number found: ' + mo.group())

Phone number found: 415-555-4242


## Grouping with Parentheses

In [5]:
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('My number is 415-555-4242.')

In [6]:
mo.group(1)

'415'

In [7]:
mo.group(2)

'555-4242'

In [8]:
mo.group()

'415-555-4242'

In [9]:
mo.groups()

('415', '555-4242')

In [10]:
areaCode, mainNumber = mo.groups()

In [11]:
areaCode

'415'

In [12]:
mainNumber

'555-4242'

In [13]:
phoneNumRegex = re.compile(r'(\(\d\d\d\)) (\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('My phone number is (415) 555-4242.')
mo.group(1)

'(415)'

## Matching Multiple Groups with the Pipe

In [14]:
heroRegex = re.compile (r'Batman|Tina Fey')
mo1 = heroRegex.search('Batman and Tina Fey.')
mo1.group()

'Batman'

In [15]:
mo2 = heroRegex.search('Tina Fey and Batman.')
mo2.group()

'Tina Fey'

In [16]:
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
mo = batRegex.search('Batmobile lost a wheel')
mo.group()

'Batmobile'

In [17]:
mo.group(1)

'mobile'

## Optional Matching with the Question Mark

In [18]:
batRegex = re.compile(r'Bat(wo)?man')
mo1 = batRegex.search('The Adventures of Batman')
mo1.group()

'Batman'

In [19]:
mo2 = batRegex.search('The Adventures of Batwoman')
mo2.group()

'Batwoman'

In [20]:
phoneRegex = re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d\d')
mo1 = phoneRegex.search('My number is 415-555-4242')
mo1.group()

'415-555-4242'

In [21]:
mo2 = phoneRegex.search('My number is 555-4242')
mo2.group()

'555-4242'

## Matching Zero or More with the Star

In [22]:
batRegex = re.compile(r'Bat(wo)*man')
mo1 = batRegex.search('The Adventures of Batman')
mo1.group()

'Batman'

In [23]:
mo2 = batRegex.search('The Adventures of Batwoman')
mo2.group()

'Batwoman'

In [24]:
mo3 = batRegex.search('The Adventures of Batwowowowoman')
mo3.group()

'Batwowowowoman'

## Matching One or More with the Plus

In [25]:
batRegex = re.compile(r'Bat(wo)+man')
mo1 = batRegex.search('The Adventures of Batwoman')
mo1.group()

'Batwoman'

In [26]:
mo2 = batRegex.search('The Adventures of Batwowowowoman')
mo2.group()

'Batwowowowoman'

In [27]:
# search() returns None when the pattern is not found; test for that with
# an identity check rather than ==.
mo3 = batRegex.search('The Adventures of Batman')
mo3 is None

True

## Matching Specific Repetitions with Curly Brackets

In [28]:
haRegex = re.compile(r'(Ha){3}')
mo1 = haRegex.search('HaHaHa')
mo1.group()

'HaHaHa'

In [29]:
# search() returns None when the pattern is not found; test for that with
# an identity check rather than ==.
mo2 = haRegex.search('Ha')
mo2 is None

True

## Greedy and Nongreedy Matching

In [30]:
greedyHaRegex = re.compile(r'(Ha){3,5}')
mo1 = greedyHaRegex.search('HaHaHaHaHa')
mo1.group()

'HaHaHaHaHa'

In [31]:
nongreedyHaRegex = re.compile(r'(Ha){3,5}?')
mo2 = nongreedyHaRegex.search('HaHaHaHaHa')
mo2.group()

'HaHaHa'

In [32]:
HaRegex = re.compile(r'(Ha){3,5}')
mo3 = HaRegex.search('HaHaHaHaHa')
mo3.group()

'HaHaHaHaHa'

## The findall() Method

In [33]:
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
mo = phoneNumRegex.search('Cell: 415-555-9999 Work: 212-555-0000')
mo.group()

'415-555-9999'

In [34]:
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d') # has no groups
phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000')

['415-555-9999', '212-555-0000']

In [35]:
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)') # has groups
phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000')

[('415', '555', '9999'), ('212', '555', '0000')]

## Character Classes

In [36]:
xmasRegex = re.compile(r'\d+\s\w+')
xmasRegex.findall('12 drummers, 11 pipers, 10 lords, 9 ladies, 8 maids, 7 swans, 6 geese, 5 rings, 4 birds, 3 hens, 2 doves, 1 partridge')

['12 drummers',
 '11 pipers',
 '10 lords',
 '9 ladies',
 '8 maids',
 '7 swans',
 '6 geese',
 '5 rings',
 '4 birds',
 '3 hens',
 '2 doves',
 '1 partridge']

## Making Your Own Character Classes

In [37]:
vowelRegex = re.compile(r'[aeiouAEIOU]')
vowelRegex.findall('Robocop eats baby food. BABY FOOD.')

['o', 'o', 'o', 'e', 'a', 'a', 'o', 'o', 'A', 'O', 'O']

In [38]:
consonantRegex = re.compile(r'[^aeiouAEIOU]')
consonantRegex.findall('Robocop eats baby food. BABY FOOD.')

['R',
 'b',
 'c',
 'p',
 ' ',
 't',
 's',
 ' ',
 'b',
 'b',
 'y',
 ' ',
 'f',
 'd',
 '.',
 ' ',
 'B',
 'B',
 'Y',
 ' ',
 'F',
 'D',
 '.']

## The Caret and Dollar Sign Characters

In [39]:
beginsWithHello = re.compile(r'^Hello')
beginsWithHello.search('Hello world!')

<re.Match object; span=(0, 5), match='Hello'>

In [40]:
# No match yields None; use an identity check rather than ==.
beginsWithHello.search('He said hello.') is None

True

In [41]:
endsWithNumber = re.compile(r'\d$')
endsWithNumber.search('Your number is 42')

<re.Match object; span=(16, 17), match='2'>

In [42]:
# No match yields None; use an identity check rather than ==.
endsWithNumber.search('Your number is forty two.') is None

True

In [43]:
wholeStringIsNum = re.compile(r'^\d+$')
wholeStringIsNum.search('1234567890')

<re.Match object; span=(0, 10), match='1234567890'>

In [44]:
# No match yields None; use an identity check rather than ==.
wholeStringIsNum.search('12345xyz67890') is None

True

In [45]:
# No match yields None; use an identity check rather than ==.
wholeStringIsNum.search('12 34567890') is None

True

## The Wildcard Character

In [46]:
atRegex = re.compile(r'.at')
atRegex.findall('The cat in the hat sat on the flat mat.')

['cat', 'hat', 'sat', 'lat', 'mat']

## Matching Everything with Dot-Star

In [47]:
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
mo = nameRegex.search('First Name: Al Last Name: Sweigart')
mo.group(1)

'Al'

In [48]:
mo.group(2)

'Sweigart'

In [49]:
nongreedyRegex = re.compile(r'<.*?>')
mo = nongreedyRegex.search('<To serve man> for dinner.>')
mo.group()

'<To serve man>'

In [50]:
greedyRegex = re.compile(r'<.*>')
mo = greedyRegex.search('<To serve man> for dinner.>')
mo.group()

'<To serve man> for dinner.>'

## Matching Newlines with the Dot Character

In [52]:
noNewlineRegex = re.compile('.*')
noNewlineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the law.').group()

'Serve the public trust.'

In [54]:
newlineRegex = re.compile('.*', re.DOTALL)
newlineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the law.').group()

'Serve the public trust.\nProtect the innocent.\nUphold the law.'

## Review of Regex Symbols

This chapter covered a lot of notation, so here’s a quick review of what you learned:

* The ? matches zero or one of the preceding group.

* The * matches zero or more of the preceding group.

* The + matches one or more of the preceding group.

* The {n} matches exactly n of the preceding group.

* The {n,} matches n or more of the preceding group.

* The {,m} matches 0 to m of the preceding group.

* The {n,m} matches at least n and at most m of the preceding group.

* {n,m}? or *? or +? performs a nongreedy match of the preceding group.

* ^spam means the string must begin with spam.

* spam$ means the string must end with spam.

* The . matches any character, except newline characters.

* \d, \w, and \s match a digit, word, or space character, respectively.

* \D, \W, and \S match anything except a digit, word, or space character, respectively.

* [abc] matches any character between the brackets (such as a, b, or c).

* [^abc] matches any character that isn’t between the brackets.



## Case-Insensitive Matching

In [55]:
robocop = re.compile(r'robocop', re.I)
robocop.search('Robocop is part man, part machine, all cop.').group()

'Robocop'

In [56]:
robocop.search('ROBOCOP protects the innocent.').group()

'ROBOCOP'

In [57]:
robocop.search('Al, why does your programming book talk about robocop so much?').group()

'robocop'

## Substituting Strings with the sub() Method

In [58]:
namesRegex = re.compile(r'Agent \w+')
namesRegex.sub('CENSORED', 'Agent Alice gave the secret documents to Agent Bob.')

'CENSORED gave the secret documents to CENSORED.'

In [209]:
agentNamesRegex = re.compile(r'Agent (\w)\w*')
agentNamesRegex.sub(r'\1****', 'Agent Alice told Agent Carol that Agent Eve knew Agent Bob was a double agent.')

'A**** told C**** that E**** knew B**** was a double agent.'

# Project: Phone Number and Email Address Extractor

## Step 1: Create a Regex for Phone Numbers

In [61]:
#! python3
# phoneAndEmail.py - Finds phone numbers and email addresses on the clipboard.

import pyperclip, re

# Matches US-style phone numbers with an optional area code and an optional
# extension.  With findall(), each result is a tuple whose indexes are:
# 0 whole match, 1 area code, 2 separator, 3 first 3 digits, 4 separator,
# 5 last 4 digits, 6 whole extension, 7 extension marker, 8 extension digits.
phoneRegex = re.compile(r'''(
    (\d{3}|\(\d{3}\))?                # area code
    (\s|-|\.)?                        # separator
    (\d{3})                           # first 3 digits
    (\s|-|\.)                         # separator
    (\d{4})                           # last 4 digits
    (\s*(ext\.|ext|x)\s*(\d{2,5}))?   # extension; the dot is escaped so only a literal "ext." matches
    )''', re.VERBOSE)

# TODO: Create email regex.

# TODO: Find matches in clipboard text.

# TODO: Copy results to the clipboard.

ModuleNotFoundError: No module named 'pyperclip'

## Step 2: Create a Regex for Email Addresses

In [63]:
# Create email regex.
# With findall(), index 0 of each result tuple is the whole address and
# index 1 is the final dot-something part.
emailRegex = re.compile(r'''(
    [a-zA-Z0-9._%+-]+      # username
    @                      # @ symbol
    [a-zA-Z0-9.-]+         # domain name
    (\.[a-zA-Z]{2,})       # dot-something; no upper bound, so long TLDs are not truncated
    )''', re.VERBOSE)

## Step 3: Find All Matches in the Clipboard Text

In [65]:
# Find matches in clipboard text.
text = str(pyperclip.paste())
matches = []
for groups in phoneRegex.findall(text):
    # groups[1] (the area code) is '' when the number has none; skip empty
    # parts so the normalized number does not start with a stray '-'.
    phoneNum = '-'.join(part for part in (groups[1], groups[3], groups[5]) if part)
    if groups[8]:
        # Append the extension digits in a uniform ' x<digits>' form.
        phoneNum += ' x' + groups[8]
    matches.append(phoneNum)

# groups[0] is the whole e-mail address.
for groups in emailRegex.findall(text):
    matches.append(groups[0])

NameError: name 'pyperclip' is not defined

## Step 4: Join the Matches into a String for the Clipboard

In [66]:
# Copy results to the clipboard, one match per line, and echo them.
if not matches:
    print('No phone numbers or email addresses found.')
else:
    clipboardText = '\n'.join(matches)
    pyperclip.copy(clipboardText)
    print('Copied to clipboard:')
    print(clipboardText)

NameError: name 'matches' is not defined

# Practice Questions

In [160]:
numRegex = re.compile(r'^\d{1,3}(,\d{3})*$')
mo = numRegex.search('12,341,567')
mo.group(0)

'12,341,567'

In [83]:
numRegex = re.compile(r'^[A-Z][a-z]*\sNakamoto')
mo = numRegex.search('Satoshi Nakamoto')
mo.group(0)

'Satoshi Nakamoto'

In [87]:
# 'FOOTBALLS' is not one of the allowed objects (apples|cats|baseballs), so
# search() finds no match and returns None; calling .group() on None then
# raises the AttributeError recorded as this cell's output.
numRegex = re.compile(r'(Alice|Bob|Carol)\s(eats|pets|throws)\s(apples|cats|baseballs)\.',re.IGNORECASE)
mo = numRegex.search('ALICE THROWS FOOTBALLS.')
mo.group(0)

AttributeError: 'NoneType' object has no attribute 'group'

# Practice Projects


## Strong Password Detection

In [205]:
def isStrongPassword(password):
    """Return True if password satisfies every strength rule, else False.

    A strong password is at least 8 characters long and contains at least
    one uppercase letter (A-Z), one lowercase letter (a-z) and one digit
    (0-9).
    """
    requirements = (
        # re.DOTALL makes '.' match newlines as well, so every character
        # counts toward the minimum length.
        re.compile(r'.{8,}', re.DOTALL),
        re.compile(r'[A-Z]'),
        re.compile(r'[a-z]'),
        re.compile(r'[0-9]'),
    )
    # search() returns None when a requirement is not met.
    return all(regex.search(password) is not None for regex in requirements)

In [207]:
isStrongPassword('aBcDeF7H')

True

## Regex Version of strip()

In [346]:
def regexStrip(text, remove=''):
    """Regex-based equivalent of str.strip().

    text   -- the string to strip.
    remove -- the set of characters to remove from both ends of text.  When
              it is '' (the default), whitespace is removed instead.

    Returns text with every leading and trailing character found in remove
    (or every leading and trailing whitespace character) taken off.
    """
    if remove == '':
        charClass = r'\s'
    else:
        # Treat remove as a set of literal characters, like str.strip():
        # escaping keeps '.', ']', '^', '-' etc. from acting as regex syntax.
        charClass = '[' + re.escape(remove) + ']'

    # \A and \Z anchor to the very start and very end of the string; unlike
    # '$', \Z does not also match just before a trailing newline.
    stripRegex = re.compile(r'\A{0}+|{0}+\Z'.format(charClass))
    return stripRegex.sub('', text)
        
     

In [347]:
regexStrip('    RegExr was created by gskinner.com, and is proudly hosted by Media Temple.     ', '')

'RegExr was created by gskinner.com, and is proudly hosted by Media Temple.'

In [348]:
regexStrip('xxxxxxxxxxxRegExr was created by gskinner.com, and is proudly hosted by Media Temple.xxxxxxxxxx', 'x')

'RegExr was created by gskinner.com, and is proudly hosted by Media Temple.'